## Step 0: Import libraries

In [None]:
# Pandas is used for data manipulation
import pandas as pd
import numpy as np
%matplotlib inline

## Step 1: Data preparation (data loading, cleaning, conversion, exploration...)

### 1.1 Load data (train & test datasets) and append the test dataset to the train dataset, making a single dataframe

In [None]:
# Column names for the dataframe; the raw files carry no header row of their own.
columns=[
    'age','workclass','fnlwgt','education','education-num','marital-status',
    'occupation','relationship','race','sex','capital-gain','capital-loss',
    'hours-per-week','native-country','income',
]
# 'header=None' because the file holds no feature names; 'names=' supplies them.
df=pd.read_csv('adult.data',header=None,names=columns)
print(df.columns)
print(df.shape)
# skiprows=[0] drops the first line of 'adult.test'
# (presumably a non-data line -- confirm against the file).
df_test=pd.read_csv('adult.test',skiprows=[0],header=None,names=columns)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way and, with ignore_index=True, renumbers the rows 0..n-1.
df=pd.concat([df,df_test],ignore_index=True)
df.shape

### 1.2 Strip white space and the trailing dot from some of the feature columns

In [None]:
# 'income' loses every space and every '.' (the trailing dot on some labels).
df['income']=df['income'].apply(lambda label: label.replace(' ','').replace('.',''))
# The remaining text columns listed here only have their spaces removed.
space_padded_columns=[
    'sex','race','relationship','occupation',
    'education','marital-status','native-country',
]
for column_name in space_padded_columns:
    df[column_name]=df[column_name].apply(lambda text: text.replace(' ',''))
# Boolean column: True where the cleaned income label equals '>50K'.
df['income_greater_50K']=df['income']=='>50K'

### Some data exploration

In [None]:
# Mean 'age' for every (occupation, sex) pair.
table=pd.pivot_table(
    df,
    values=['age'],
    index=['occupation','sex'],
    aggfunc=[np.mean],
)
table

In [None]:
# Count samples per (race, education) pair by summing a constant column of ones.
# The helper column 'dumb' lives on a temporary copy, so 'df' ends up unchanged.
with_ones=df.assign(dumb=1)
table=pd.pivot_table(
    with_ones,
    values=['dumb'],
    index=['race'],
    columns=['education'],
    aggfunc=[np.sum],
    fill_value=0,
    margins=True,
)
table

In [None]:
# Count samples per (sex, income) pair by summing a constant column of ones.
# The helper column 'dumb' lives on a temporary copy, so 'df' ends up unchanged.
with_ones=df.assign(dumb=1)
table=pd.pivot_table(
    with_ones,
    values=['dumb'],
    index=['sex'],
    columns=['income'],
    aggfunc=[np.sum],
    margins=True,
)
table

In [None]:
# Take the plain count table out of the pivot's ('sum', 'dumb') column levels.
# .copy() makes sure the new row below is added to an independent frame rather
# than to a slice of 'table'.
table1=table['sum']['dumb'].copy()
# Share of women in each income column: Female count divided by the 'All' row.
table1.loc['Female ratio']=table1.loc['Female']/table1.loc['All']
# The values are fractions in [0, 1]; the '%' format spec renders them as the
# percentages the message announces.
print("Women percentage: {2:.2%} <=50K percentage :{0:.2%},>50K percentage :{1:.2%}".\
      format(table1.loc['Female ratio','<=50K'],table1.loc['Female ratio','>50K'],table1.loc['Female ratio','All']))

In [None]:
# Pie chart of how many samples fall into each 'occupation' category.
occupation_counts=df['occupation'].value_counts()
occupation_counts.plot(kind='pie')

In [None]:
# For people aged 27 or more, print per race:
#   <number with income above 50K> <total number in that group>
aged_27_plus=(df.age>=27)
for race_label in ('Black','White'):
    in_group=aged_27_plus&(df.race==race_label)
    high_income=in_group&(df['income_greater_50K']==True)
    print(df[high_income].shape[0],df[in_group].shape[0])

In [None]:
# Summary of the object-typed (text) columns.
# print(...) with parentheses runs on both Python 2 and 3 (the bare print
# statement is a SyntaxError on Python 3), and the builtin 'object' replaces the
# 'np.object' alias, which was removed in NumPy 1.24.
print(df.describe(include=[object]))
df.isnull().describe()   # check to see if there are any NaN s

In [None]:
# Stacked bar chart: one bar per 'sex', split by 'income'.
group_sizes=df.groupby(['sex','income']).size()
# Move the 'income' level into columns so each income label becomes a bar segment.
income_by_sex=group_sizes.unstack()
print(income_by_sex, income_by_sex.index,income_by_sex.columns)
income_by_sex.plot(kind='bar',stacked=True)

In [None]:
# First chart: one vertical bar per 'occupation', split by 'sex'.
sex_by_occupation=df.groupby(['occupation','sex']).size().unstack()
sex_by_occupation.plot(kind='bar',stacked=True)
# Second chart: one horizontal bar per 'sex', split by 'occupation'.
occupation_by_sex=df.groupby(['sex','occupation']).size().unstack()
occupation_by_sex.plot(kind='barh',stacked=True)

In [None]:
# Summary statistics from DataFrame.describe() called with its default arguments.
df.describe()

In [None]:
# Overlaid, semi-transparent histograms of 'hours-per-week', one per 'sex' group.
# Bin edges run 0, 4, 8, ... (step 4, stopping before 100).
hour_bins=np.arange(start=0,stop=100,step=4)
df.groupby(['sex'])['hours-per-week'].plot(kind='hist',bins=hour_bins,alpha=.5,legend=True)

In [None]:
# For each sex, print the share of its samples in the '<=50K' and '>50K' groups.
a= df.groupby(['sex','income']).size()
ratio_reports=[
    ('Female','<=50K',"Ratio of women whose income is less than $50K: {}"),
    ('Female','>50K',"Ratio of women whose income is greater than $50K: {}"),
    ('Male','<=50K',"Ratio of men whose income is less than $50K: {}"),
    ('Male','>50K',"Ratio of men whose income is greater than $50K: {}"),
]
for sex_label,income_label,template in ratio_reports:
    sizes_for_sex=a.loc[sex_label]
    # Convert to float before dividing so the quotient is never an integer division.
    bracket_size=sizes_for_sex.loc[income_label].astype('float')
    sex_total=sizes_for_sex.astype('float').sum()
    print(template.format(bracket_size / sex_total))

### 1.3 Drop `fnlwgt` feature column and convert all non-numerical features to numerical ones  

In [None]:
# Drop 'fnlwgt' from df in place. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
df.drop('fnlwgt',axis=1,inplace=True,errors='ignore')
# Target: the boolean income flag as 0/1 integers.
target=df['income_greater_50K'].astype('int')
# Features: everything except the two income columns.
features=df.drop(['income','income_greater_50K'],axis=1)
# Convert the non-numerical feature columns to numeric indicator columns.
features_numeric=pd.get_dummies(features)
features_numeric.columns

### 1.4 Split the single dataframe into a dataframe storing train data  and a dataframe storing test data

In [None]:

# Row position at which the combined frame is cut into train and test parts.
# Presumably the number of rows read from 'adult.data' (the first df.shape
# printed by the loading cell) -- confirm if the input files change.
n_train=32561
# Rows [0, n_train) form the train split; the remaining rows form the test split.
train_features= features_numeric.iloc[:n_train]
test_features= features_numeric.iloc[n_train:]
train_target= target.iloc[:n_train]
test_target= target.iloc[n_train:]
# (To train on every row instead, slice features_numeric and target with iloc[:].)

## Step 2: Cross-validation (against train dataset)

### 2.1a Cross validation on `SVC` classifier

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
# 5-fold cross-validation of an RBF-kernel SVC on the train split.
# The scaler sits inside the pipeline, so it is re-fitted by the pipeline on
# each cross-validation run rather than once on all rows.
scaler=StandardScaler()
clf = SVC(C=1.0,cache_size=300,kernel='rbf')
pipeline= Pipeline([('transformer',scaler),('estimator',clf)])
scores= cross_val_score(pipeline,train_features.values,train_target.values,cv=5,n_jobs=-1,verbose=2)
print(scores)
# Reported spread is two standard deviations of the fold scores.
print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(),scores.std()*2))
# SVC parameters recorded in an earlier version of this cell, kept as a comment
# so they are no longer echoed as cell output (note cache_size=200 here differs
# from the 300 used above):
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#     decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
#     max_iter=-1, probability=False, random_state=None, shrinking=True,
#     tol=0.001, verbose=False)

### 2.1b Cross validation on `Adaboost` classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# 8-fold cross-validation of AdaBoost (SAMME) built from 400 depth-1 decision trees.
weak_learner=DecisionTreeClassifier(max_depth=1)
clf= AdaBoostClassifier(weak_learner,algorithm="SAMME",n_estimators=400)
scores= cross_val_score(
    clf,
    train_features.values,
    train_target.values,
    cv=8,
    n_jobs=-1,
    verbose=9,
)
print(scores)
# Reported spread is two standard deviations of the fold scores.
print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(),scores.std()*2))

### 2.1c Cross validation on `LogisticRegression` classifier

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# 8-fold cross-validation of a scaler + LogisticRegression (lbfgs) pipeline.
scaler=StandardScaler()
clf = LogisticRegression(solver='lbfgs')
pipeline_steps=[('transformer',scaler),('estimator',clf)]
pipeline= Pipeline(pipeline_steps)
scores= cross_val_score(
    pipeline,
    train_features.values,
    train_target.values,
    cv=8,
    n_jobs=-1,
    verbose=2,
)
print(scores)
# Reported spread is two standard deviations of the fold scores.
print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(),scores.std()*2))


## Step 3: Learn a classifier from the train dataset and test it against the test dataset

### 3.1a Mini-batch gradient descent method: Train Logistic Regression classifier with `SGDClassifier`

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
import random
def batches(l, n):
    """Yield consecutive slices of `l`, each at most `n` items long.

    Parameters:
        l: any sliceable sequence with a length (list, range, string, ...).
        n: maximum number of items per slice; the last slice may be shorter.

    Yields:
        l[i:i+n] for i = 0, n, 2n, ... while i < len(l). Nothing is yielded
        for an empty sequence.
    """
    # range() exists on both Python 2 and 3; xrange() is a NameError on Python 3.
    for i in range(0, len(l), n):
        yield l[i:i+n]
scaler=StandardScaler()
# Normalize the input features: 'fit' first, 'transform' afterwards.
# 'scalerX' stores the normalized feature inputs of the train data set.
scaler.fit(train_features.values)
scalerX =scaler.transform(train_features.values)
# Define a logistic regression classifier.
# NOTE(review): newer scikit-learn releases name this loss 'log_loss' instead of
# 'log' -- confirm against the installed version before changing it.
clf = SGDClassifier(loss='log') # shuffle=True by default
train_labels = train_target.values
# partial_fit needs the full set of class labels; compute it once, not per batch.
class_labels = np.unique(train_labels)
# random.shuffle works in place and therefore needs a mutable list; a bare
# range() object is immutable on Python 3 and made shuffle raise TypeError.
shuffledRange = list(range(len(train_labels)))
# 'epochs': number of passes over the train data set
# 'batch_size': mini-batch size
epochs = 50
batch_size=4000
# Mini-batch gradient descent optimization loop
for epoch in range(epochs):
    # New random row order for every pass.
    random.shuffle(shuffledRange)
    # Each batch is a slice of the shuffled row indices; indexing the arrays with
    # it selects the same rows, in the same order, as slicing pre-shuffled copies.
    for batch in batches(shuffledRange, batch_size):
        clf.partial_fit(scalerX[batch], train_labels[batch], classes=class_labels)

### 3.1b Stochastic gradient descent method: Train Logistic Regression classifier with `SGDClassifier`

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
# Standardize the train features, then fit an SGD-trained logistic-loss classifier.
train_matrix=train_features.values
scaler=StandardScaler()
# fit() returns the fitted scaler itself, so fit and transform can be chained.
scalerX =scaler.fit(train_matrix).transform(train_matrix)
# shuffle=True is already the default; it is only spelled out here.
clf = SGDClassifier(loss='log',max_iter=10,shuffle=True)
clf.fit(scalerX, train_target.values)

### 3.1c Train a `LogisticRegression` classifier

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardize the train features, then fit LogisticRegression with the lbfgs solver.
train_matrix=train_features.values
scaler=StandardScaler()
# fit() returns the fitted scaler itself, so fit and transform can be chained.
scalerX =scaler.fit(train_matrix).transform(train_matrix)
clf = LogisticRegression(solver='lbfgs')
clf.fit(scalerX, train_target.values)

### 3.2 Test the classifier on test dataset

In [None]:
from sklearn.metrics import classification_report
# Score whichever 'clf' / 'scaler' pair was trained last on the held-out test rows.
expected_labels = test_target.values
# The test features go through the scaler that was fitted on the train features.
scaled_test = scaler.transform(test_features.values)
results_predicted = clf.predict(scaled_test)
print(classification_report(expected_labels, results_predicted))
# Boolean array: True wherever the prediction differs from the true label.
compare = results_predicted != expected_labels
wrong_count = float(compare.astype('int').sum())
total_count = float(compare.size)
print("Test error ratio: {}".format(wrong_count/total_count))