# Dimensionality Reduction using PCA

Since feature engineering gave us 68 more columns, PCA was used to reduce dimensions.


In [95]:
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [96]:
wholedata = pd.read_csv("/resources/data/kdd/project/data/featureData.csv")

In [97]:
originaldata = pd.read_csv("/resources/data/kdd/project/data/fiveclasses_train.csv")

In [98]:
wholedata.head()

Unnamed: 0,VisitNumber,TripType,Weekday,NumItems,Return,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
0,43,38,5,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,63,36,5,5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,83,36,5,9,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,86,37,5,22,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,97,38,5,13,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
wholedata.shape

(15195, 72)

## Train Test Split

In [100]:
from sklearn.cross_validation import train_test_split
train, test = train_test_split(wholedata, test_size = 0.3)

In [101]:
train_array = train.values
test_array = test.values

In [102]:
X_train = train_array[:,2:]
Y_train = train_array[:,1]
X_test = test_array[:,2:]
Y_test = test_array[:,1]

## PCA to fit the train data

In [103]:
pca = PCA()
fit = pca.fit(X_train)

## Random Forest Classifier

In [104]:
X_t_train = pca.transform(X_train)
X_t_test = pca.transform(X_test)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_t_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [105]:
pred_test =  clf.predict(X_t_test)
pred_train = clf.predict(X_t_train)

In [106]:
pd.crosstab(Y_test,pred_test , rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,3.0,5.0,36.0,37.0,38.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3.0,980,18,8,0,4,1010
5.0,12,770,86,23,43,934
36.0,7,79,774,16,34,910
37.0,0,6,19,728,84,837
38.0,2,26,24,86,730,868
All,1001,899,911,853,895,4559


In [107]:
pd.crosstab(Y_test,predictions , rownames=['True'], colnames=['Predicted'], margins=True)
print(classification_report(Y_test, predictions))

             precision    recall  f1-score   support

        3.0       0.23      0.23      0.23      1010
        5.0       0.19      0.20      0.20       934
       36.0       0.21      0.21      0.21       910
       37.0       0.20      0.19      0.19       837
       38.0       0.20      0.20      0.20       868

avg / total       0.21      0.21      0.21      4559



In [108]:
from sklearn.metrics import accuracy_score, classification_report

### Training and Test accuracy

In [109]:
accuracy_score(Y_train, pred_train), accuracy_score(Y_test, pred_test)

(0.98401654757427603, 0.87343715727133142)

## Logistic Regression Classifier

In [110]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [114]:
lr.fit(X = X_t_train, y = Y_train)
predictions = lr.predict(X_t_test)
pred_train = lr.predict(X_t_train)
pd.crosstab(Y_test,predictions , rownames=['True'], colnames=['Predicted'], margins=True)


Predicted,3.0,5.0,36.0,37.0,38.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3.0,986,13,9,1,1,1010
5.0,15,796,74,22,27,934
36.0,7,90,779,12,22,910
37.0,4,9,18,728,78,837
38.0,6,29,26,67,740,868
All,1018,937,906,830,868,4559


## Training and Test accuracy

In [115]:
accuracy_score(Y_train, pred_train), accuracy_score(Y_test,predictions)

(0.90456938698758937, 0.88374643562184685)

In [116]:
pd.crosstab(Y_test,predictions , rownames=['True'], colnames=['Predicted'], margins=True)
print(classification_report(Y_test, predictions))

             precision    recall  f1-score   support

        3.0       0.97      0.98      0.97      1010
        5.0       0.85      0.85      0.85       934
       36.0       0.86      0.86      0.86       910
       37.0       0.88      0.87      0.87       837
       38.0       0.85      0.85      0.85       868

avg / total       0.88      0.88      0.88      4559

