#### IMPORTING LIBRARIES

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### LOADING THE DATA

In [29]:
# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

#### TREATING THE DATA

In [30]:
#### ENCODING 'GENDER', 'CITY_CATEGORY', 'STAY_IN_CURRENT_CITY_YEARS' AND 'AGE' AS INTEGER CODES IN BOTH THE DATASETS ####
# (column name, {raw string value: integer code}) pairs, applied in this order.
_category_codes = (
    ('Gender', {'F':0, 'M':1}),
    ('City_Category', {'A':0, 'B':1, 'C':2}),
    ('Stay_In_Current_City_Years', {'0':0, '1':1, '2':2, '3':3, '4+':4}),
    ('Age', {'0-17': 0, '18-25': 1, '26-35': 2, '36-45': 3, '46-50': 4, '51-55': 5, '55+': 6}),
)

for _column, _codes in _category_codes:
    for _frame in (train_data, test_data):
        # A value absent from the mapping becomes NaN under .map(), and the
        # following .astype(int) then raises instead of passing it through.
        _frame[_column] = _frame[_column].map(_codes).astype(int)

#### ENCODING PRODUCT IDS AND ONE-HOT ENCODING OCCUPATION

In [31]:
from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()

# Gather the distinct Product_IDs of both splits so the encoder is fitted on
# every ID it will later be asked to transform. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
all_id = pd.concat([
    pd.DataFrame(train_data['Product_ID'].unique()),
    pd.DataFrame(test_data['Product_ID'].unique()),
])

# Keep the original, un-encoded test Product_IDs; the submission cell reads them.
final_id = pd.DataFrame(test_data['Product_ID'])

# Replace Product_ID in both splits with the integer label assigned by the encoder.
encoder.fit(all_id[all_id.columns[0]])
train_data['Product_ID'] = encoder.transform(train_data['Product_ID'])
test_data['Product_ID'] = encoder.transform(test_data['Product_ID'])

# Append one int8 indicator column per Occupation value. The prefix 'Occupation_'
# combined with get_dummies' default '_' separator yields names like 'Occupation__0'.
# NOTE(review): the dummies are built per split, so the two frames only get the
# same columns if both contain the same set of Occupation values - confirm.
train_data = pd.concat([train_data, pd.get_dummies(train_data['Occupation'], prefix = 'Occupation_').astype(np.int8)], axis = 1)
test_data = pd.concat([test_data, pd.get_dummies(test_data['Occupation'], prefix = 'Occupation_').astype(np.int8)], axis = 1)

# train_data = pd.concat([train_data, pd.get_dummies(train_data['Product_ID'], prefix = 'Product_ID_', sparse = True).astype(np.int)], axis = 1)
# test_data = pd.concat([test_data, pd.get_dummies(test_data['Product_ID'], prefix = 'Product_ID_', sparse = True).astype(np.int)], axis = 1)

In [12]:
# Print a summary of train_data (index range, columns, non-null counts, dtypes).
train_data.info()

<class 'pandas.sparse.frame.SparseDataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 33 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null int64
Gender                        550068 non-null int64
Age                           550068 non-null int64
Occupation                    550068 non-null int64
City_Category                 550068 non-null int64
Stay_In_Current_City_Years    550068 non-null int64
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
Occupation__0                 550068 non-null int8
Occupation__1                 550068 non-null int8
Occupation__2                 550068 non-null int8
Occupation__3                 550068 non-null int8
Occupation__4                 550068 non-null int

#### FEATURE ENGINEERING

In [32]:
#### NEW FEATURE 'Multicategory': HOW MANY CATEGORIES THE PRODUCT IS LISTED IN (1, 2 OR 3) ####
for _frame in (train_data, test_data):
    # Start from 3 and subtract one for each of Product_Category_2 /
    # Product_Category_3 that is null: both null -> 1, exactly one null -> 2,
    # neither null -> 3. The result is stored as float.
    _missing = _frame[['Product_Category_2', 'Product_Category_3']].isnull().sum(axis = 1)
    _frame['Multicategory'] = 3.0 - _missing

#### GAURAV'S IDEA: ONE-HOT ENCODING THE CATEGORICAL FEATURES

In [33]:
#### ONE-HOT ENCODING OF THE CATEGORICAL FEATURES ####

####             AGE                ####
# Indicator column names, listed so that position i corresponds to the
# integer code i stored in the 'Age' column.
_age_columns = (
    'Age(0-17)',
    'Age(18-25)',
    'Age(26-35)',
    'Age(36-45)',
    'Age(46-50)',
    'Age(51-55)',
    'Age(55+)',
)

for _frame in (train_data, test_data):
    for _code, _name in enumerate(_age_columns):
        # 1.0 on rows whose Age code equals this bucket's code, 0.0 elsewhere.
        _frame[_name] = (_frame['Age'] == _code).astype(float)



####            MULTICLASS           ####
# PC2 / PC3 are 1.0 when the row has a non-null Product_Category_2 /
# Product_Category_3 and 0.0 otherwise.
for _frame in (train_data, test_data):
    for _flag, _source in (('PC2', 'Product_Category_2'), ('PC3', 'Product_Category_3')):
        _frame[_flag] = _frame[_source].notnull().astype(float)

####          CITY CATEGORY            ####
# City_Category was encoded earlier as the integer codes 0, 1, 2 (for 'A',
# 'B', 'C'), so CG1, CG2, CG3 flag codes 0, 1, 2 respectively.
# The train split used to compare against 1, 2, 3 while the test split compared
# against 0, 1, 2: train CG3 was therefore always zero and CG1/CG2 described
# different cities in the two splits. Both splits now share one set of codes.
for _frame in (train_data, test_data):
    for _code, _name in enumerate(('CG1', 'CG2', 'CG3')):
        # 1.0 on rows whose City_Category equals this code, 0.0 elsewhere.
        _frame[_name] = (_frame['City_Category'] == _code).astype(float)

####          STAY IN CURRENT YEARS      ####
# ST0..ST4 flag the integer codes 0..4 of Stay_In_Current_City_Years.
_stay_columns = ('ST0', 'ST1', 'ST2', 'ST3', 'ST4')

for _frame in (train_data, test_data):
    _stay = _frame['Stay_In_Current_City_Years']
    for _years, _name in enumerate(_stay_columns):
        # 1.0 where the encoded number of years equals this code, else 0.0.
        _frame[_name] = (_stay == _years).astype(float)


#### DROPPING COLUMNS

In [34]:
#train_data = train_data.drop(['User_ID', 'Product_ID', 'Product_Category_2', 'Product_Category_3'], axis = 1)
#test_data = test_data.drop(['Product_Category_2', 'Product_Category_3'], axis = 1)

# Replace every remaining NaN in both splits with 0.
train_data, test_data = train_data.fillna(value = 0), test_data.fillna(value = 0)

# Target column, and the training inputs with the user id and target removed.
labels = train_data['Purchase']
features = train_data.drop(['User_ID', 'Purchase'], axis = 'columns')

# Test inputs: everything except the user id.
final_features = test_data.drop(['User_ID'], axis = 'columns')

#### PCA

In [60]:
from sklearn.decomposition import PCA

# Fit a 5-component PCA on the training features only.
pca = PCA(n_components = 5).fit(features)


def _project(frame):
    """Return `frame` transformed by the fitted PCA, wrapped in a DataFrame."""
    return pd.DataFrame(pca.transform(frame))


# Project both the training and the test features with the same fitted PCA.
features = _project(features)
final_features = _project(final_features)
final_features.head(10)

MemoryError: 

#### DIVIDING INTO TRAINING AND CROSS VALIDATION SETS

In [21]:
# train_test_split is imported from sklearn.model_selection: the old
# sklearn.cross_validation module was removed in scikit-learn 0.20, and this
# notebook already relies on sklearn.model_selection for GridSearchCV.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is the same on every run.
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.2, random_state = 0)

AttributeError: 'numpy.ndarray' object has no attribute 'sp_index'

#### IMPORTING ALL THE REGRESSORS

In [36]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

#### WORKING WITH THE REGRESSORS

In [10]:
# Baseline: linear regression fitted on the training split, scored with R^2
# on the held-out split.
clf = LinearRegression().fit(features_train, labels_train)
pred = clf.predict(features_test)
print(r2_score(y_true = labels_test, y_pred = pred))

0.150972705832


In [20]:
# Random forest with 100 trees and a minimum of 50 samples to split a node,
# scored with R^2 on the held-out split.
clf = RandomForestRegressor(min_samples_split = 50, n_estimators = 100).fit(features_train, labels_train)
pred = clf.predict(features_test)
print(r2_score(y_true = labels_test, y_pred = pred))

0.669913197655


In [None]:
# Grid search over forest size and minimum split size for a random forest.
parameters = dict(
    n_estimators = (100, 250, 500),
    min_samples_split = (10, 25, 50, 75),
)
rndm = RandomForestRegressor()
clf = GridSearchCV(estimator = rndm, param_grid = parameters).fit(features_train, labels_train)

# Score the fitted search object on the held-out split and report the
# parameter combination it selected.
pred = clf.predict(features_test)
print(r2_score(y_true = labels_test, y_pred = pred))
print(clf.best_params_)

In [8]:
# Single decision tree with default settings, scored with R^2 on the held-out split.
clf = DecisionTreeRegressor().fit(features_train, labels_train)
pred = clf.predict(features_test)
print(r2_score(y_true = labels_test, y_pred = pred))

0.633564446761


In [None]:
# AdaBoost with 700 estimators and learning rate 0.5, scored with R^2 on the
# held-out split.
clf = AdaBoostRegressor(learning_rate = 0.5, n_estimators = 700).fit(features_train, labels_train)
pred = clf.predict(features_test)
print(r2_score(y_true = labels_test, y_pred = pred))

In [20]:
# Gradient-boosted trees (XGBoost) with 5000 estimators, scored with R^2 on the
# held-out split.
clf = xgboost.XGBRegressor(n_estimators = 5000).fit(features_train, labels_train)
pred = clf.predict(features_test)
print(r2_score(y_true = labels_test, y_pred = pred))

NameError: name 'features_train' is not defined

#### SUBMISSION

In [39]:
# Final model: XGBoost (depth 10, 5000 estimators) trained on all of
# `features` / `labels`, then used to predict the rows of `final_features`.
clf = xgboost.XGBRegressor(n_estimators = 5000, max_depth = 10).fit(features, labels)
pred = clf.predict(final_features)


In [27]:
# Hold the predictions as a NumPy array; the submission cell reads `Arr`.
Arr = np.array(pred)

# Preview only the first few predictions. The previous `Arr.tolist()` discarded
# its result and merely echoed every single prediction into the cell output.
Arr[:10]

In [40]:
# Assemble the submission: test-set user ids, the original (un-encoded)
# Product_IDs kept in `final_id`, and the predicted Purchase values.
submission = pd.DataFrame({"User_ID": test_data['User_ID'], "Product_ID": final_id['Product_ID'], "Purchase": Arr})

# index=False keeps the DataFrame's row index out of the file; without it
# to_csv writes the index as an extra, unnamed first column.
# NOTE(review): assumes the expected submission format is exactly these three
# columns - confirm against the competition's sample submission.
submission.to_csv('submission_one.csv', index = False)