In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set()

# Features and Labels
The Adult income dataset is available from the UCI Machine Learning Repository. It records whether an adult's income is above or below 50K per year, together with a set of demographic and employment features. We will fit several models to this data and compare how well each one classifies it: Logistic Regression, a Support Vector Machine, and Gaussian Naive Bayes.
You can read about the data features at this [Link](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names).

In [74]:
import os

# Column labels for the data file; read_csv assigns them positionally because
# they are passed through `names=`. The last column, 'class', is the target.
cols = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','class']

# Location of the training file. Set the ADULT_DATA_PATH environment variable to
# run the notebook on another machine; the original local path is the fallback.
data_path = os.environ.get('ADULT_DATA_PATH', r'C:\Users\Siam Cheema\Downloads\Documents\adults.train.txt')
df = pd.read_csv(data_path, names = cols)

In [3]:
df.shape

(11416, 15)

In [75]:
df.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
education-num     float64
marital-status     object
occupation         object
relationship       object
race               object
sex                object
capital-gain      float64
capital-loss      float64
hours-per-week    float64
native-country     object
class              object
dtype: object

In [77]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         1
education-num     1
marital-status    1
occupation        1
relationship      1
race              1
sex               1
capital-gain      1
capital-loss      1
hours-per-week    1
native-country    1
class             1
dtype: int64

In [4]:
df = df.dropna() # drop every row that contains at least one missing value

In [5]:
df.shape

(11415, 15)

In [6]:
# Keep only the object-typed (categorical / string) columns of df
obt_df = df.select_dtypes(include='object')

In [7]:
# Remove the last object column so that only categorical *features* remain.
# With the column order used when loading, that last column is the 'class' target,
# so despite its name this frame does NOT contain the class column.
obt_df_with_class = obt_df.drop(columns=obt_df.columns[-1])

In [8]:
# One-hot encode every categorical feature (one indicator column per category level)
df_dummies = pd.get_dummies(data=obt_df_with_class)

In [9]:
df_dummies.shape

(11415, 101)

In [10]:
# Numeric features: the int64 and float64 columns of df
df1 = df.select_dtypes(include=['int64', 'float64'])

In [11]:
df1.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39,77516,13.0,2174.0,0.0,40.0
1,50,83311,13.0,0.0,0.0,13.0
2,38,215646,9.0,0.0,0.0,40.0
3,53,234721,7.0,0.0,0.0,40.0
4,28,338409,13.0,0.0,0.0,40.0


In [12]:
# Place the one-hot columns and the numeric columns side by side (rows align on the index)
df_final = pd.concat([df_dummies, df1], axis='columns')

In [13]:
# Keep every column except the first one. Because df_dummies was concatenated
# first, the dropped column is one of the one-hot indicator columns.
# NOTE(review): presumably meant as a reference-category drop, but this removes a
# single level of the first categorical feature only - confirm the intent
# (pd.get_dummies(..., drop_first=True) would drop one level per feature).
df_final1 = df_final.iloc[:,1:]

In [14]:
df_final1.shape

(11415, 106)

In [15]:
df['class'].value_counts()

 <=50K    8694
 >50K     2721
Name: class, dtype: int64

In [16]:
# Label the "<=50K" rows as 0.
# A single .loc assignment writes directly into df; the chained form
# df['class'][mask] = ... may write into a temporary copy and raises
# SettingWithCopyWarning.
is_le_50k = df['class'].str.contains("<") == True
df.loc[is_le_50k, 'class'] = 0

In [17]:
# Label the ">50K" rows as 1.
# Entries that are not strings yield NaN from .str.contains, and NaN == False is
# False, so only string labels without "<" are selected here.
# A single .loc assignment writes directly into df; the chained form
# df['class'][mask] = ... may write into a temporary copy and raises
# SettingWithCopyWarning.
is_gt_50k = df['class'].str.contains("<") == False
df.loc[is_gt_50k, 'class'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,0
2,38,Private,215646,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,0
3,53,Private,234721,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,0
4,28,Private,338409,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,0


In [19]:
# Target vector: the 0/1 labels of the 'class' column as an int64 NumPy array
y = df['class'].values.astype('int64')

In [20]:
# Feature matrix as a float64 array. The explicit cast keeps X numeric even when
# the one-hot columns are boolean (pandas versions where get_dummies returns bool),
# in which case .values alone would produce an object-dtype array.
X = df_final1.values.astype('float64')

In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [32]:
# Hold out 25% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [55]:
# Standardise the features: learn the scaling from the training split only,
# then apply that same fitted transform to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [56]:
# The three classifiers compared in the rest of the notebook
lr = LogisticRegression(max_iter=200, solver='lbfgs')  # logistic regression
svm = SVC(gamma='auto')                                # support vector machine
nb = GaussianNB()                                      # Gaussian naive Bayes

In [62]:
def print_score(model, X_train, y_train, X_test, y_test, train=True, cv=10):
    """Print performance metrics of an already fitted classifier.

    Prints the accuracy score, the classification report and the confusion
    matrix for either the training split or the test split. For the training
    split it additionally prints the mean and standard deviation of the
    cross-validated accuracy.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    train : if truthy, report on the training split (including
        cross-validation); otherwise report on the test split.
    cv : number of folds passed to ``cross_val_score`` (default 10).

    Returns
    -------
    None. All results are written to standard output.
    """
    if train:
        X_eval, y_eval = X_train, y_train
    else:
        X_eval, y_eval = X_test, y_test

    # Predict once and reuse the result for all three metrics; prediction can
    # be expensive (e.g. for kernel SVMs).
    y_pred = model.predict(X_eval)

    if train:
        print("Train Result:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_eval, y_pred)))
    else:
        print("Test Results:\n")
        print("Accuracy Score: {0:.4f}".format(accuracy_score(y_eval, y_pred)))

    print("Classification Report: \n {} \n".format(classification_report(y_eval, y_pred)))
    print("Confusion Matrix:")
    print(confusion_matrix(y_eval, y_pred))

    if train:
        # Cross-validated accuracy on the training split only.
        res = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

In [63]:
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [64]:
print_score(lr, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy Score: 0.8504

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.93      0.90      6516
           1       0.73      0.59      0.65      2045

   micro avg       0.85      0.85      0.85      8561
   macro avg       0.80      0.76      0.78      8561
weighted avg       0.84      0.85      0.84      8561
 

Confusion Matrix:
[[6071  445]
 [ 836 1209]]
Average Accuracy: 	 0.8459
Accuracy SD: 		 0.0068


In [65]:
print_score(lr, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.8500
Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.93      0.90      2178
           1       0.72      0.59      0.65       676

   micro avg       0.85      0.85      0.85      2854
   macro avg       0.80      0.76      0.78      2854
weighted avg       0.84      0.85      0.84      2854
 

Confusion Matrix:
[[2026  152]
 [ 276  400]]


In [66]:
svm.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [67]:
print_score(svm, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy Score: 0.8635

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.95      0.91      6516
           1       0.79      0.59      0.67      2045

   micro avg       0.86      0.86      0.86      8561
   macro avg       0.83      0.77      0.79      8561
weighted avg       0.86      0.86      0.86      8561
 

Confusion Matrix:
[[6190  326]
 [ 843 1202]]
Average Accuracy: 	 0.8394
Accuracy SD: 		 0.0117


In [68]:
print_score(svm, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.8476
Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.95      0.90      2178
           1       0.75      0.53      0.62       676

   micro avg       0.85      0.85      0.85      2854
   macro avg       0.81      0.74      0.76      2854
weighted avg       0.84      0.85      0.84      2854
 

Confusion Matrix:
[[2061  117]
 [ 318  358]]


In [69]:
nb.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [70]:
print_score(nb, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy Score: 0.3375

Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.13      0.24      6516
           1       0.26      0.99      0.42      2045

   micro avg       0.34      0.34      0.34      8561
   macro avg       0.62      0.56      0.33      8561
weighted avg       0.80      0.34      0.28      8561
 

Confusion Matrix:
[[ 872 5644]
 [  28 2017]]
Average Accuracy: 	 0.3319
Accuracy SD: 		 0.0158


In [71]:
print_score(nb, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.3217
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.12      0.21      2178
           1       0.26      0.99      0.41       676

   micro avg       0.32      0.32      0.32      2854
   macro avg       0.61      0.55      0.31      2854
weighted avg       0.80      0.32      0.25      2854
 

Confusion Matrix:
[[ 251 1927]
 [   9  667]]
