# 0. Prerequisite

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the lab data set, using the first CSV column as the DataFrame index.
# NOTE(review): confirm column 0 really is a row identifier -- the df.info()
# output below reports the index range as "0 to 0", which is unusual for a key.
df = pd.read_csv(filepath_or_buffer='lab1.csv', index_col=0)

In [3]:
def preprocessing_data(df):
    """Prepare the raw donor table for modelling.

    Steps, in order:
      1. Drop ``TargetD`` and the gift count / average / time columns.
      2. Drop every row whose ``DemAge`` is missing.
      3. Replace ``DemMedIncome`` and ``GiftAvgCard36`` with boolean
         "value was missing" indicators.
      4. Drop the promotion-count columns.
      5. Drop any row that still contains a missing value.
      6. One-hot encode the remaining non-numeric columns with
         ``pd.get_dummies``.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing (at least) every column named below.

    Returns
    -------
    pandas.DataFrame
        The cleaned, fully numeric/boolean table.

    Raises
    ------
    KeyError
        If any of the columns to drop is not present in ``df``.
    """
    gift_cols = [
        'TargetD', 'GiftCnt36', 'GiftCntAll', 'GiftCntCard36',
        'GiftCntCardAll', 'GiftAvgLast', 'GiftAvg36', 'GiftAvgAll',
        'GiftTimeLast', 'GiftTimeFirst',
    ]
    prom_cols = [
        'PromCnt12', 'PromCnt36', 'PromCntAll',
        'PromCntCard12', 'PromCntCard36', 'PromCntCardAll',
    ]

    df = df.drop(columns=gift_cols)

    # Keep only rows with a known age. The explicit .copy() makes the column
    # assignments below operate on an independent frame instead of a slice,
    # which avoids SettingWithCopyWarning. A mean-fill of DemAge used to follow
    # this filter, but it could never fill anything once the null rows were
    # gone, so it has been removed.
    df = df[df['DemAge'].notna()].copy()

    # Overwrite both columns with "was missing" flags. The former fillna(0)
    # calls ran on these boolean columns and were no-ops, so they are dropped.
    # NOTE(review): this discards the original values; separate *_nan indicator
    # columns may have been intended. Kept as-is because the later cells use
    # the boolean GiftAvgCard36 column as the classification target.
    df['DemMedIncome'] = df['DemMedIncome'].isna()
    df['GiftAvgCard36'] = df['GiftAvgCard36'].isna()

    # Promotion columns are removed *before* dropna so that missing values in
    # them do not cause rows to be discarded.
    df = df.drop(columns=prom_cols).dropna()

    return pd.get_dummies(df)

In [4]:
# Overwrites the raw frame with its preprocessed version. Re-running this cell
# without first re-running the read_csv cell raises a KeyError, because the
# columns that preprocessing_data drops are already gone.
df = preprocessing_data(df)

In [5]:
# Summarise the column dtypes and non-null counts of the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7279 entries, 0 to 0
Data columns (total 19 columns):
ID                  7279 non-null int64
GiftAvgCard36       7279 non-null bool
StatusCatStarAll    7279 non-null int64
DemCluster          7279 non-null int64
DemAge              7279 non-null float64
DemMedHomeValue     7279 non-null int64
DemPctVeterans      7279 non-null int64
DemMedIncome        7279 non-null bool
StatusCat96NK_A     7279 non-null uint8
StatusCat96NK_E     7279 non-null uint8
StatusCat96NK_F     7279 non-null uint8
StatusCat96NK_L     7279 non-null uint8
StatusCat96NK_N     7279 non-null uint8
StatusCat96NK_S     7279 non-null uint8
DemGender_F         7279 non-null uint8
DemGender_M         7279 non-null uint8
DemGender_U         7279 non-null uint8
DemHomeOwner_H      7279 non-null uint8
DemHomeOwner_U      7279 non-null uint8
dtypes: bool(2), float64(1), int64(5), uint8(11)
memory usage: 490.5 KB


In [6]:
# Split the data into training and testing sets.
# GiftAvgCard36 is the target; every other column is used as a feature.
# NOTE(review): X still contains the ID column -- confirm whether an
# identifier should really be a model input.
X = df.drop(columns=['GiftAvgCard36'])
y = df['GiftAvgCard36']

# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# to_numpy() is the supported replacement.
X_mat = X.to_numpy()
# Hold out 30% for testing, stratified on y so both splits keep the same class
# ratio; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_mat, y, test_size=0.3, stratify=y, random_state=0)

  """


# 1. Standardisation and Logistic Regression

a) What is the difference between logistic regression and linear regression? 

Logistic regression is used to predict a categorical (typically binary) dependent variable.
Linear regression is used when the dependent variable is continuous.

b) Describe how logistic regression performs its prediction.

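Logistic regression models the log-odds of the positive class as a linear combination of the input features. Passing that value through the logistic (sigmoid) function gives a probability between 0 and 1, and the observation is assigned to the positive class when the probability exceeds a threshold (0.5 by default).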

c) Write code to perform standardisation on your training and test dataset.

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation from the training split
# only, then apply that same transformation to both splits so that no
# test-set statistics leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [8]:
# Inspect the standardised training matrix (NumPy abbreviates large arrays).
X_train

array([[-0.69402919, -1.12179409,  1.06177714, ..., -0.18635413,
        -1.42195128,  1.42195128],
       [-0.0070011 ,  0.89142919,  0.3013199 , ..., -0.18635413,
         0.70325898, -0.70325898],
       [ 1.67362395, -1.12179409,  1.61483695, ..., -0.18635413,
        -1.42195128,  1.42195128],
       ...,
       [ 0.3590343 ,  0.89142919, -0.87393219, ..., -0.18635413,
         0.70325898, -0.70325898],
       [ 0.57430671,  0.89142919,  0.09392247, ..., -0.18635413,
         0.70325898, -0.70325898],
       [ 0.39406351, -1.12179409, -0.18260743, ..., -0.18635413,
         0.70325898, -0.70325898]])

In [9]:
# Inspect the test matrix after applying the scaler fitted on the training split.
X_test

array([[ 1.49818868,  0.89142919,  0.92351219, ..., -0.18635413,
         0.70325898, -0.70325898],
       [ 0.55632216,  0.89142919, -0.94306467, ...,  5.36612725,
         0.70325898, -0.70325898],
       [ 1.47196195,  0.89142919,  1.13090962, ..., -0.18635413,
        -1.42195128,  1.42195128],
       ...,
       [-0.10860028, -1.12179409, -0.87393219, ..., -0.18635413,
         0.70325898, -0.70325898],
       [-0.76678078, -1.12179409,  1.26917457, ..., -0.18635413,
         0.70325898, -0.70325898],
       [ 1.11033683,  0.89142919, -0.94306467, ..., -0.18635413,
         0.70325898, -0.70325898]])

d) What does standardisation do to your data? How does it benefit your regression model? 

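Standardisation rescales each input feature so that it has a mean of 0 and a standard deviation of 1; it does not change the shape of the feature's distribution. For regression models this puts all features on a comparable scale, which helps gradient-based optimisation converge faster and makes the regularisation penalty and the coefficient magnitudes comparable across features.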

e) Write code to fit a logistic regression model to your training data. How does it perform on the training and test data? Do you see any indication of overfitting?  

In [10]:
from sklearn.linear_model import LogisticRegression

# Baseline model using scikit-learn's default hyperparameters.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# score() returns mean accuracy; comparing the two splits is the overfitting check.
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print("Training:", train_accuracy)
print("Test:", test_accuracy)

Training: 0.8143277723258097
Test: 0.8067765567765568




Training accuracy (0.814) and test accuracy (0.807) are both high and very close to each other, which implies that the model is not overfitting.

f) Write code to find the most important features in your model.

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Candidate values for C (inverse regularisation strength): 10^-6 ... 10^3.
params = {'C': [10 ** exponent for exponent in range(-6, 4)]}

# 10-fold cross-validated search over C, using every available core (n_jobs=-1).
cv = GridSearchCV(
    estimator=LogisticRegression(random_state=0),
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
cv.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10,
                               100, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [12]:
# Weight vector of the best model found by the grid search, paired with the
# column names of the feature frame it was trained on.
coef = cv.best_estimator_.coef_[0]
feature_names = X.columns

# Rank features by absolute coefficient, largest first, and keep at most 20
# (drop the [:20] slice to print every feature).
indices = np.argsort(np.absolute(coef))[::-1][:20]

for feature_idx in indices:
    print(feature_names[feature_idx], ':', coef[feature_idx])

StatusCat96NK_S : -0.00039595519485191333
StatusCat96NK_A : 0.00036455221936939876
StatusCatStarAll : -0.00029305025473820015
StatusCat96NK_E : 0.0001669022052626906
StatusCat96NK_N : -0.00012473109368537686
StatusCat96NK_L : 9.422832714177853e-05
DemPctVeterans : -7.218149157640623e-05
DemAge : -4.588185191022331e-05
DemGender_M : 4.035605359256606e-05
DemGender_F : -3.1544269144665554e-05
DemMedHomeValue : 2.5075159520180007e-05
DemGender_U : -2.3395237073187303e-05
DemCluster : 1.6201779797031502e-05
ID : -7.047181436590488e-06
StatusCat96NK_F : -6.213725640201492e-06
DemHomeOwner_U : 7.546056781895748e-07
DemHomeOwner_H : -7.546056781894461e-07
DemMedIncome : 0.0


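Ranked by absolute coefficient, StatusCat96NK_S is the most important feature (-0.000396), followed by StatusCat96NK_A (0.000365) and StatusCatStarAll (-0.000293). Note that every coefficient is very small in magnitude, which suggests the selected model is heavily regularised, and that DemMedIncome has a coefficient of exactly 0, so it contributes nothing to the prediction.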