In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the training and test sets from the local data/ directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Report the (rows, columns) shape of each frame.
shape_report = "Shape of data\n train = {} \n test ={}"
print(shape_report.format(train_df.shape, test_df.shape))

Shape of data
 train = (550068, 12) 
 test =(233599, 11)


In [None]:
train_df.dtypes

In [None]:
test_df.dtypes

In [None]:
train_df.head()

In [4]:
train_df.isna().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [None]:
test_df.isna().sum()

In [None]:
train_df.isna().sum()/len(train_df)*100

In [None]:
test_df.isna().sum()/len(test_df)*100

In [5]:
# Replace every missing value with 0 in both frames.
train_df, test_df = train_df.fillna(0), test_df.fillna(0)

In [6]:
train_df.isna().sum()

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

In [None]:
test_df.isna().sum()

In [7]:
# Cast Product_Category_2 and Product_Category_3 to int64 in both frames.
for frame in (train_df, test_df):
    for category_col in ('Product_Category_2', 'Product_Category_3'):
        frame[category_col] = frame[category_col].astype('int64')



In [8]:
# Treat every column of the test frame as categorical, and cast the same
# columns in the training frame as well.
cat_cols = list(test_df.columns)

for frame in (train_df, test_df):
    frame[cat_cols] = frame[cat_cols].astype('category')

In [9]:
train_df.describe()

Unnamed: 0,Purchase
count,550068.0
mean,9263.968713
std,5023.065394
min,12.0
25%,5823.0
50%,8047.0
75%,12054.0
max,23961.0


In [10]:
train_df.describe(include='category')

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
count,550068,550068,550068,550068,550068,550068,550068,550068,550068,550068,550068
unique,5891,3631,2,7,21,3,5,2,20,18,16
top,1001680,P00265242,M,26-35,4,B,1,0,5,0,0
freq,1026,1880,414259,219587,72308,231173,193821,324731,150933,173638,383247


In [None]:
test_df.describe(include='category')

In [None]:
cat_cols

In [None]:
# Categorical columns to compare between train and test; the two ID columns
# are left out.
imp_cols = [col for col in cat_cols if col not in ('User_ID', 'Product_ID')]

imp_cols

In [None]:

# Print the sorted distinct values of each selected column, first for the
# training frame and then for the test frame.
for col in imp_cols:
    print('\n\n', col)
    for label, frame in (('train_df', train_df), ('test_df', test_df)):
        print(label)
        print(sorted(frame[col].unique()))

From the output above we can see that __Product_Category_1__ in the test data does not contain the values __19, 20__.

In [None]:
# Distinct user identifiers seen in each data set.
train_df_unique_users = list(train_df['User_ID'].unique())
test_df_unique_users = list(test_df['User_ID'].unique())

# Distinct product identifiers seen in each data set.
train_df_unique_products = list(train_df['Product_ID'].unique())
test_df_unique_products = list(test_df['Product_ID'].unique())


In [None]:
# Users that occur in the test set but never in the training set.
# The set difference makes the direction explicit (test minus train) and
# avoids scanning a Python list once per user.
print('Number of new users in test data')
print(len(set(test_df_unique_users) - set(train_df_unique_users)))

In [None]:
# Products seen in train that never occur in test. A set difference replaces
# the per-element list scan; both inputs hold distinct IDs, so the count is
# unchanged.
print('Number of products in train but not in test data')
print(len(set(train_df_unique_products) - set(test_df_unique_products)))

In [None]:
# Products seen in test that never occur in train. A set difference replaces
# the per-element list scan; both inputs hold distinct IDs, so the count is
# unchanged.
print('Number of unknown products in test data or \nNumber of products in test but not in train')
print(len(set(test_df_unique_products) - set(train_df_unique_products)))

So there are 46 new unknown products in test data

In [None]:
train_df.columns

## Data Viz

In [None]:
# Purchase amount per Occupation, coloured by Gender.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the positional form raises TypeError there.
sns.scatterplot(x=train_df.Occupation, y=train_df.Purchase, alpha=0.1, hue=train_df.Gender)

In [None]:
# Number of training rows for each Gender (rows) / Occupation (columns) pair.
temp = pd.crosstab(index=train_df['Gender'], columns=train_df['Occupation'])
temp

In [None]:
# Same Gender x Occupation counts, then expressed as the percentage each
# gender contributes within an occupation (every column sums to 100).
temp = pd.crosstab(index=train_df['Gender'], columns=train_df['Occupation'])
occupation_totals = temp.sum()
temp_x = temp.div(occupation_totals, axis='columns').mul(100)
temp_x.T

We can observe that most of the customers in Occupation 9 are female.

In [None]:
# Purchase amount per Age group, coloured by Gender.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the positional form raises TypeError there.
sns.scatterplot(x=train_df.Age, y=train_df.Purchase, alpha=0.1, hue=train_df.Gender)

In [None]:
# Purchase distribution per Occupation, split by Gender.
plt.figure(figsize=(15, 10))
sns.boxenplot(
    x=train_df['Occupation'],
    y=train_df['Purchase'],
    hue=train_df['Gender'],
)

The plot above shows that males purchase more in every occupation except Occupation 18.

In [None]:
# Purchase distribution per Age group, split by Gender.
plt.figure(figsize=(15, 10))
sns.boxenplot(
    x=train_df['Age'],
    y=train_df['Purchase'],
    hue=train_df['Gender'],
)

In [None]:
# Purchase distribution per Occupation, split by Marital_Status.
plt.figure(figsize=(15, 10))
sns.boxenplot(
    x=train_df['Occupation'],
    y=train_df['Purchase'],
    hue=train_df['Marital_Status'],
)

Looks like Marital_Status has no effect on purchasing power

In [None]:
# Purchase distribution for each Gender (box plot).
sns.set(style="whitegrid")
sns.boxplot(x=train_df['Gender'], y=train_df['Purchase'])

Males purchase higher amounts.

In [None]:
# Purchase distribution for each Age group (box plot).
sns.set(style="whitegrid")
sns.boxplot(x=train_df['Age'], y=train_df['Purchase'])

In [None]:
# Purchase distribution for each Occupation (box plot).
sns.set(style="whitegrid")
sns.boxplot(x=train_df['Occupation'], y=train_df['Purchase'])

In [None]:
# Purchase distribution for each City_Category (box plot).
sns.set(style="whitegrid")
sns.boxplot(x=train_df['City_Category'], y=train_df['Purchase'])

Customers in City_Category C purchase higher amounts.

In [None]:
# Purchase distribution for each Stay_In_Current_City_Years value (box plot).
sns.set(style="whitegrid")
sns.boxplot(x=train_df['Stay_In_Current_City_Years'], y=train_df['Purchase'])

In [None]:
# Purchase distribution for each Marital_Status value (box plot).
sns.set(style="whitegrid")
sns.boxplot(x=train_df['Marital_Status'], y=train_df['Purchase'])

In [None]:
# Split the training rows by gender.
is_male = train_df['Gender'] == 'M'
is_female = train_df['Gender'] == 'F'
males = train_df.loc[is_male]
females = train_df.loc[is_female]

In [None]:
males.shape

In [None]:
# Purchase amount per Occupation for male customers, coloured by Marital_Status.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the positional form raises TypeError there.
sns.scatterplot(x=males.Occupation, y=males.Purchase, hue=males.Marital_Status, alpha=0.1)

In [11]:
train_df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,0,0,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6,14,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,0,0,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14,0,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,0,0,7969


In [12]:
test_df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11,0
1,1000009,P00113442,M,26-35,17,C,0,0,3,5,0
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14,0
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9,0
4,1000011,P00053842,F,26-35,1,C,1,0,4,5,12


In [13]:
# Work on copies; train_df / test_df themselves are kept as they are.
train, test = train_df.copy(), test_df.copy()

In [14]:
train.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
# One independent encoder per column that gets integer-encoded, so each one
# keeps its own fitted classes.
(
    le_user_id,
    le_product_id,
    le_gender,
    le_age,
    le_city,
    le_stay,
) = (LabelEncoder() for _ in range(6))

In [17]:
# Fit each encoder on its training column and overwrite that column with the
# resulting integer codes.
train['User_ID'] = le_user_id.fit_transform(train['User_ID'])
train['Product_ID'] = le_product_id.fit_transform(train['Product_ID'])
train['Gender'] = le_gender.fit_transform(train['Gender'])
train['Age'] = le_age.fit_transform(train['Age'])
train['City_Category'] = le_city.fit_transform(train['City_Category'])
train['Stay_In_Current_City_Years'] = le_stay.fit_transform(train['Stay_In_Current_City_Years'])

In [18]:
# Encode the test Product_IDs with the encoder fitted on train. Products that
# never appeared in train are collapsed into one '<unknown>' class.
known_products = set(le_product_id.classes_)  # set gives O(1) membership checks
test['Product_ID'] = test_df['Product_ID'].map(
    lambda s: s if s in known_products else '<unknown>'
)
# Register the placeholder only once: without this guard every re-run of the
# cell appends another '<unknown>' entry, which changes the integer code
# assigned to unknown products.
if '<unknown>' not in known_products:
    le_product_id.classes_ = np.append(le_product_id.classes_, '<unknown>')
test.Product_ID = le_product_id.transform(test.Product_ID)

In [19]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0,672,0,0,10,0,2,0,3,0,0,8370
1,0,2376,0,0,10,0,2,0,1,6,14,15200
2,0,852,0,0,10,0,2,0,12,0,0,1422
3,0,828,0,0,10,0,2,0,12,14,0,1057
4,1,2734,1,6,16,2,4,0,8,0,0,7969


In [20]:
test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,1196,M,46-50,7,B,2,1,1,11,0
1,1000009,1043,M,26-35,17,C,0,0,3,5,0
2,1000010,2764,F,36-45,1,B,4+,1,5,14,0
3,1000010,1358,F,36-45,1,B,4+,1,4,9,0
4,1000011,529,F,26-35,1,C,1,0,4,5,12


In [21]:
# Apply the encoders fitted on train to the matching test columns
# (transform only; nothing is re-fitted here).
test['User_ID'] = le_user_id.transform(test['User_ID'])
test['Gender'] = le_gender.transform(test['Gender'])
test['Age'] = le_age.transform(test['Age'])
test['City_Category'] = le_city.transform(test['City_Category'])
test['Stay_In_Current_City_Years'] = le_stay.transform(test['Stay_In_Current_City_Years'])

In [22]:
test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,3,1196,1,4,7,1,2,1,1,11,0
1,8,1043,1,2,17,2,0,0,3,5,0
2,9,2764,0,3,1,1,4,1,5,14,0
3,9,1358,0,3,1,1,4,1,4,9,0
4,10,529,0,2,1,2,1,0,4,5,12


In [23]:
train.columns


Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [24]:
# split data
from sklearn.model_selection import train_test_split

In [25]:
# Separate the target (Purchase) from the feature columns.
# `columns=` replaces the positional axis argument, which pandas deprecated
# in 1.3 and removed in 2.0 (there, `drop('Purchase', 1)` raises TypeError).
y = train.Purchase
X = train.drop(columns='Purchase')

In [26]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
X_train.shape

(440054, 11)

In [28]:
X_valid.shape

(110014, 11)

In [29]:
y_train.shape

(440054,)

In [30]:
y_valid.shape

(110014,)

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
lin_reg = LinearRegression()

In [33]:
lin_reg.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [34]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def display_score(y_actual, y_predicted):
    """Return the root-mean-squared error between actual and predicted values.

    Despite the name, nothing is printed; the RMSE is returned as a float.
    """
    mse = mean_squared_error(y_actual, y_predicted)
    return sqrt(mse)

In [35]:
display_score(y_train,lin_reg.predict(X_train))

4615.09137610716

In [36]:
display_score(y_valid,lin_reg.predict(X_valid))

4606.373792522189

In [37]:
test_pred = lin_reg.predict(test)

In [38]:
test_pred

array([10686.06747823, 10178.66019576,  8193.36136831, ...,
       11968.14226871,  7528.00492463,  8524.70783761])

In [None]:
test_df.head()

In [39]:
# Wrap the predictions in a one-column DataFrame (the same name is rebound
# from the prediction array to the frame) and display it.
test_pred = pd.DataFrame(test_pred)
test_pred

Unnamed: 0,0
0,10686.067478
1,10178.660196
2,8193.361368
3,8954.709143
4,11074.434246
5,11915.757856
6,12954.766311
7,11029.785718
8,9181.489063
9,8962.503744


In [40]:
test_pred.to_csv('test_pred.csv')

In [None]:
# Store both identifier columns of the test frame as strings.
for id_col in ('User_ID', 'Product_ID'):
    test_df[id_col] = test_df[id_col].astype('str')


In [None]:
# NOTE(review): `comb` is not defined in any cell visible above, so this cell
# would raise NameError on a fresh Restart & Run All — confirm and remove.
comb