In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report,confusion_matrix
%matplotlib inline

In [3]:
def loaddata(path='dataset_Facebook.csv'):
    """Read the semicolon-separated dataset into a DataFrame.

    Rows that cannot be parsed (for example, too many fields) are skipped
    instead of aborting the load.

    Parameters
    ----------
    path : str or path-like, default 'dataset_Facebook.csv'
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file.
    """
    import inspect

    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of `on_bad_lines`; use whichever spelling this pandas accepts.
    if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
        return pd.read_csv(path,sep=';',on_bad_lines='skip')
    return pd.read_csv(path,sep=';',error_bad_lines=False)

In [3]:
def printaccuracy(y_test,predict,model):
    """Print a framed evaluation report for one classifier.

    Parameters
    ----------
    y_test : array-like
        True labels.
    predict : array-like
        Predicted labels, aligned with ``y_test``.
    model : str
        Display name printed in the report heading.
    """
    separator = "-------------------------------------"
    spacer = " "

    print(model," report")
    print(separator, spacer, sep="\n")
    print(confusion_matrix(y_test,predict))
    print(classification_report(y_test,predict))
    print(spacer, separator, spacer, sep="\n")

In [4]:
def normalizedata(X):
    """Standardize the columns of ``X`` with a freshly fitted StandardScaler.

    The input is not modified; callers must use the returned value.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix to scale.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        The scaled features. A DataFrame input comes back as a DataFrame
        with the same index and column labels.
    """
    SS = StandardScaler()
    scaled = SS.fit_transform(X)
    if isinstance(X, pd.DataFrame):
        # Keep the labels so downstream code can still select by column name.
        scaled = pd.DataFrame(scaled, index=X.index, columns=X.columns)
    print("Normalization done")
    return scaled

In [5]:
def PC(components,x):
    """Reduce ``x`` with PCA and return the projection as a DataFrame.

    Parameters
    ----------
    components : int
        Number of principal components to keep.
    x : array-like
        Data to fit the PCA on and project.

    Returns
    -------
    pandas.DataFrame
        One column per component, named 'pc0', 'pc1', ...
    """
    projected = PCA(n_components=components).fit_transform(x)
    labels = ['pc'+str(idx) for idx in range(components)]
    return pd.DataFrame(data = projected, columns = labels)

In [6]:
def removeoutliers(data,inplace=False):
    """Remove rows that contain an outlier in any column.

    A row is kept only when the absolute z-score of every value in it is
    below 3; z-scores are computed per column over the whole frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose values can all be converted to float.
    inplace : bool, default False
        When True the outlier rows are dropped from ``data`` itself (by
        index label); otherwise ``data`` is left untouched.

    Returns
    -------
    pandas.DataFrame
        The rows that survived: ``data`` itself when ``inplace`` is True,
        a new frame otherwise.
    """
    z_score = np.abs(stats.zscore(np.asarray(data, dtype=float)))
    keep = (z_score < 3).all(axis=1)
    if inplace:
        # Rebinding the local name would not reach the caller; mutate the
        # caller's frame instead.
        data.drop(index=data.index[~keep], inplace=True)
        return data
    return data[keep].copy()

In [7]:
def validatecols(data):
    """Return True when ``data`` has exactly as many columns as ``col``.

    ``col`` is read from the enclosing notebook scope.
    NOTE(review): no visible cell defines ``col`` - confirm where it is set.
    """
    return len(data.columns) == len(col)

In [8]:
def validatedatatypes(trained, newdata):
    """Check that two frames have the same column dtypes, position by position.

    Parameters
    ----------
    trained : pandas.DataFrame
        Reference frame.
    newdata : pandas.DataFrame
        Frame to validate against ``trained``.

    Returns
    -------
    bool
        True when both frames have the same number of columns and the dtype
        of every column matches the dtype of the column at the same position
        in the other frame; False otherwise. Column names are not compared.
    """
    if len(trained.columns) != len(newdata.columns):
        return False
    return all(
        expected == actual
        for expected, actual in zip(trained.dtypes, newdata.dtypes)
    )

In [9]:
def preprocess():
    """Prepare the notebook-level ``data`` frame for the classifiers.

    Steps: drop the 'ID Number' column, integer-encode 'Diagnosis', request
    outlier removal, split 70/30 with a fixed seed, then standardize the
    features using statistics from the training split only.

    ``data`` is read from the notebook scope and is modified in place.
    NOTE(review): the ``data.head()`` output elsewhere in this notebook shows
    no 'ID Number' or 'Diagnosis' column - confirm which dataset this
    function is meant for.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_copy, y)`` where the two
        feature splits are scaled DataFrames and ``X_copy`` is the unscaled
        feature frame.
    """
    # errors='ignore' keeps this re-runnable after the column is already gone.
    data.drop('ID Number',axis=1,inplace=True,errors='ignore')
    data['Diagnosis'] = pd.Categorical(data['Diagnosis']).codes
    # Rows are only removed here if removeoutliers() honours inplace=True.
    removeoutliers(data,inplace=True)
    X = data.drop('Diagnosis',axis=1)
    X_copy = X.copy()
    y= data['Diagnosis']
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
    # Fit the scaler on the training rows only so no test-set statistics leak
    # into training, and keep the results as labelled DataFrames.
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train),index=X_train.index,columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test),index=X_test.index,columns=X_test.columns)
    print("Normalization done")
    return X_train,X_test,y_train,y_test,X_copy,y

In [10]:
def k_vs_error_graph():
    """Plot the KNN test-set error rate for every k from 2 to 49.

    Reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from the
    notebook scope and draws on the current matplotlib figure.
    """
    k_values = range(2,50)

    def error_for(k):
        # Share of test samples the k-neighbour model labels incorrectly.
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train,y_train)
        return np.mean(y_test!=model.predict(X_test))

    knn_error = [error_for(k) for k in k_values]
    plt.plot(k_values,knn_error)
    plt.xlabel("K value")
    plt.ylabel("Error")

In [11]:
def logisticregression():
    """Fit logistic regression on the training split and report on the test split.

    Uses the notebook-level ``X_train``/``y_train``/``X_test``/``y_test``.

    Returns
    -------
    float
        F1 score of the test-set predictions.
    """
    classifier = LogisticRegression(solver='lbfgs',max_iter=10000,random_state=0)
    predictions = classifier.fit(X_train,y_train).predict(X_test)
    printaccuracy(y_test,predictions,"Logistic Regression")
    score = f1_score(y_test,predictions)
    return score

In [12]:
def KNN():
    """Grid-search ``n_neighbors`` (2..49, 3-fold CV) for KNN and report on the test split.

    Uses the notebook-level ``X_train``/``y_train``/``X_test``/``y_test``.

    Returns
    -------
    float
        F1 score of the refitted best estimator on the test split.
    """
    search_space = {'n_neighbors': np.arange(2,50)}
    search = GridSearchCV(
        KNeighborsClassifier(),
        search_space,
        verbose=False,
        refit=True,
        cv=3,
    )
    search.fit(X_train,y_train)
    predictions = search.predict(X_test)
    printaccuracy(y_test,predictions,"KNN")
    return f1_score(y_test,predictions)

In [13]:
def SVM():
    """Grid-search an RBF SVM on a 2-component PCA projection and report it.

    Reads the notebook-level ``X`` and ``y``, projects a copy of ``X`` onto
    two principal components with ``PC``, and makes its own 70/30 split of
    the projection. ``C`` is searched over 1, 10, 100, 1000 and ``gamma``
    over 0.01, 0.1, 1, 10 with 3-fold CV.

    Returns
    -------
    float
        F1 score of the best estimator on this function's own test split.
    """
    svm = SVC(kernel='rbf',random_state=0)
    params = { 'C' : np.logspace(0, 3, 4), 'gamma' : np.logspace(-2, 1, 4)}
    svm_grid = GridSearchCV(svm, params,cv=3,verbose=False,return_train_score=True)
    svm_X = PC(2,X.copy())
    # Seeded like the other splits in this notebook so the score is reproducible.
    svm_X_train,svm_X_test,svm_y_train,svm_y_test=train_test_split(svm_X,y,test_size=0.3,random_state=0)
    svm_grid.fit(svm_X_train,svm_y_train)
    svm_predict = svm_grid.predict(svm_X_test)
    printaccuracy(svm_y_test,svm_predict,"SVM")
    return f1_score(svm_y_test,svm_predict)

In [14]:
def DecisionTree():
    """Fit a decision tree (seed 0) on the training split and report on the test split.

    Uses the notebook-level ``X_train``/``y_train``/``X_test``/``y_test``.

    Returns
    -------
    float
        F1 score of the test-set predictions.
    """
    tree = DecisionTreeClassifier(random_state=0)
    predictions = tree.fit(X_train,y_train).predict(X_test)
    printaccuracy(y_test,predictions,"Decision Tree")
    return f1_score(y_test,predictions)

In [15]:
def RandomForest():
    """Grid-search a random forest (3-fold CV) and report on the test split.

    ``n_estimators`` is searched over 10, 20, ..., 90 and ``max_depth`` over
    5, 10, ..., 45. Uses the notebook-level
    ``X_train``/``y_train``/``X_test``/``y_test``.

    Returns
    -------
    float
        F1 score of the best estimator on the test split.
    """
    rf = RandomForestClassifier(random_state=0)
    params = { 'n_estimators' : np.arange(10,100,10), 'max_depth' : np.arange(5,50,5)}
    rf_grid = GridSearchCV(rf, params, verbose=False, cv=3)
    rf_grid.fit(X_train,y_train)
    rf_predict = rf_grid.predict(X_test)
    printaccuracy(y_test,rf_predict,"Random Forest")
    return f1_score(y_test,rf_predict)

In [16]:
def Adaboost():
    """Grid-search AdaBoost ``n_estimators`` (10..90 step 10, 3-fold CV) and report it.

    Uses the notebook-level ``X_train``/``y_train``/``X_test``/``y_test``.

    Returns
    -------
    float
        F1 score of the best estimator on the test split.
    """
    search_space = {'n_estimators': np.arange(10,100,10)}
    booster = AdaBoostClassifier(random_state=0)
    search = GridSearchCV(booster, search_space, verbose=False, cv=3)
    search.fit(X_train,y_train)
    predictions = search.predict(X_test)
    printaccuracy(y_test,predictions,"Adaboost")
    return f1_score(y_test,predictions)

In [17]:
def GaussionNB():
    """Fit Gaussian naive Bayes on the training split and report on the test split.

    Uses the notebook-level ``X_train``/``y_train``/``X_test``/``y_test``.

    Returns
    -------
    float
        F1 score of the test-set predictions.
    """
    predictions = GaussianNB().fit(X_train,y_train).predict(X_test)
    printaccuracy(y_test,predictions,"GaussionNB")
    return f1_score(y_test,predictions)

In [18]:
def NueralNetwork():
    """Grid-search an SGD-trained MLP classifier (3-fold CV) and report it.

    Searched values: a single hidden layer of 50, 70, 90, 110 or 130 units;
    'constant' or 'adaptive' learning rate; 200 or 250 iterations. Uses the
    notebook-level ``X_train``/``y_train``/``X_test``/``y_test``.

    Returns
    -------
    float
        F1 score of the best estimator on the test split.
    """
    base_model = MLPClassifier(solver='sgd',random_state=0)
    search_space = dict(
        hidden_layer_sizes=np.arange(50,150,20),
        learning_rate=['constant','adaptive'],
        max_iter=np.arange(200,300,50),
    )
    search = GridSearchCV(base_model, search_space, cv=3,verbose=False)
    search.fit(X_train,y_train)
    predictions = search.predict(X_test)
    printaccuracy(y_test,predictions,"Nueral Network")
    return f1_score(y_test,predictions)

In [19]:
def EvaluateModels(f1scores):
    """Print each model's F1 score, then the best-scoring model.

    Parameters
    ----------
    f1scores : sequence of float
        Scores in the same order as the model names below, which is the
        order in which ``trainmodels`` appends them. Any iterable works
        (list, tuple, numpy array). An empty input prints a notice instead
        of raising.
    """
    models=['Logistic Regression',
            'K Nearest Neighbours',
            'Support Vector Machine',
            'Decision Tree',
            'Random Forest',
            'AdaBoost',
            'Guassian Naive Bayes',
            'Nueral Network'
           ]
    # Materialise once so arrays and other iterables behave like lists.
    scores = list(f1scores)

    print("        Model Results    ")
    print(" --------------------------- ")
    for i, score in enumerate(scores):
        print(models[i]," : f1 score - ",score)
    print(" -----------------------------")
    if not scores:
        # np.argmax would raise on an empty sequence.
        print("No scores to evaluate")
        return
    print("Best model")
    # argmax returns the first position of the maximum, i.e. ties go to the
    # earlier model.
    idx = int(np.argmax(scores))
    print(models[idx] ," : f1 score - ", scores[idx])
    
    
    
    
    

In [20]:
def trainmodels():
    """Train and report every classifier in turn.

    Returns
    -------
    list
        The F1 scores in call order: logistic regression, KNN, SVM, decision
        tree, random forest, AdaBoost, Gaussian NB, neural network.
    """
    return [
        logisticregression(),
        KNN(),
        SVM(),
        DecisionTree(),
        RandomForest(),
        Adaboost(),
        GaussionNB(),
        NueralNetwork(),
    ]

In [8]:
# NOTE(review): this cell sits above the `data = loaddata()` cell and no
# earlier visible cell defines `data`; on a fresh Restart & Run All it would
# fail here. Confirm whether it should move below the load cell or be removed.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 19 columns):
Page total likes                                                       500 non-null int64
Type                                                                   500 non-null object
Category                                                               500 non-null int64
Post Month                                                             500 non-null int64
Post Weekday                                                           500 non-null int64
Post Hour                                                              500 non-null int64
Paid                                                                   499 non-null float64
Lifetime Post Total Reach                                              500 non-null int64
Lifetime Post Total Impressions                                        500 non-null int64
Lifetime Engaged Users                                                 500 non-nul

In [4]:
# Load the dataset into the notebook-level `data` frame.
data = loaddata()
# Full train/evaluate pipeline, currently disabled:
#X_train,X_test,y_train,y_test,X,y = preprocess()
#f1scores = trainmodels()
#EvaluateModels(f1scores)

In [7]:
# Column names, dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 19 columns):
Page total likes                                                       500 non-null int64
Type                                                                   500 non-null object
Category                                                               500 non-null int64
Post Month                                                             500 non-null int64
Post Weekday                                                           500 non-null int64
Post Hour                                                              500 non-null int64
Paid                                                                   499 non-null float64
Lifetime Post Total Reach                                              500 non-null int64
Lifetime Post Total Impressions                                        500 non-null int64
Lifetime Engaged Users                                                 500 non-nul

In [6]:
# Preview the first five rows.
data.head()

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
0,139441,Photo,2,12,4,3,0.0,2752,5091,178,109,159,3078,1640,119,4,79.0,17.0,100
1,139441,Status,2,12,3,10,0.0,10460,19057,1457,1361,1674,11710,6112,1108,5,130.0,29.0,164
2,139441,Photo,3,12,3,3,0.0,2413,4373,177,113,154,2812,1503,132,0,66.0,14.0,80
3,139441,Photo,2,12,2,10,1.0,50128,87991,2211,790,1119,61027,32048,1386,58,1572.0,147.0,1777
4,139441,Photo,2,12,2,3,0.0,7244,13594,671,410,580,6228,3200,396,19,325.0,49.0,393


In [21]:
# Preview only the seven columns later used as model features.
data[['Page total likes','Type','Category','Post Month','Post Weekday','Post Hour','Paid']].head()

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid
0,139441,Photo,2,12,4,3,0.0
1,139441,Status,2,12,3,10,0.0
2,139441,Photo,3,12,3,3,0.0
3,139441,Photo,2,12,2,10,1.0
4,139441,Photo,2,12,2,3,0.0


In [22]:
# NOTE(review): same summary as the earlier data.info() cells; candidate for removal.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 19 columns):
Page total likes                                                       500 non-null int64
Type                                                                   500 non-null object
Category                                                               500 non-null int64
Post Month                                                             500 non-null int64
Post Weekday                                                           500 non-null int64
Post Hour                                                              500 non-null int64
Paid                                                                   499 non-null float64
Lifetime Post Total Reach                                              500 non-null int64
Lifetime Post Total Impressions                                        500 non-null int64
Lifetime Engaged Users                                                 500 non-nul

In [26]:
# Discard every row that has a missing value in any column.
# Rebinds `data`, so the unfiltered frame is no longer reachable under this name.
data = data.dropna()

In [28]:
# Summary statistics of the numeric columns after dropping incomplete rows.
data.describe()

Unnamed: 0,Page total likes,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
count,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0
mean,123173.268687,1.886869,7.028283,4.133333,7.844444,0.280808,14028.10101,29856.98,926.830303,804.155556,1425.921212,16916.28,6641.355556,614.135354,7.557576,179.145455,27.264646,213.967677
std,16203.818031,0.853268,3.304274,2.030735,4.385064,0.449849,22821.050008,77142.91,987.713267,885.18444,2007.66346,60074.02,7700.266455,614.346297,21.274384,324.412161,42.656388,381.677449
min,81370.0,1.0,1.0,1.0,1.0,0.0,238.0,570.0,9.0,9.0,9.0,567.0,236.0,9.0,0.0,0.0,0.0,0.0
25%,112324.0,1.0,4.0,2.0,3.0,0.0,3331.0,5798.0,399.0,335.0,512.5,4073.5,2213.0,297.5,1.0,57.0,10.0,72.0
50%,129600.0,2.0,7.0,4.0,9.0,0.0,5290.0,9084.0,630.0,555.0,861.0,6282.0,3478.0,416.0,3.0,101.0,19.0,125.0
75%,136393.0,3.0,10.0,6.0,11.0,1.0,13248.0,22503.0,1062.0,969.0,1479.0,15143.0,8018.0,658.5,7.0,188.0,32.5,231.0
max,139441.0,3.0,12.0,7.0,23.0,1.0,180480.0,1110282.0,11452.0,11328.0,19779.0,1107833.0,51456.0,4376.0,372.0,5172.0,790.0,6334.0


In [29]:
# Build the feature matrix and target for predicting total post reach.
# Reuse loaddata() rather than repeating the read_csv call; the explicit copy
# makes the column assignment below write to an independent frame.
data = loaddata().dropna().copy()
data['Type'] = pd.Categorical(data['Type']).codes
X = data[['Page total likes','Type','Category','Post Month','Post Weekday','Post Hour','Paid']]
y = data['Lifetime Post Total Reach']
# Seeded so the split, and every output derived from it, is reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
# Scaling statistics come from the training rows only.
scalar = StandardScaler().fit(X_train)
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)

In [32]:
# Wrap the scaled training features in a DataFrame so they can be summarised.
t = pd.DataFrame(X_train)

In [33]:
# Check the scaling: each column should show mean close to 0 and std close to 1.
t.describe()

Unnamed: 0,0,1,2,3,4,5,6
count,346.0,346.0,346.0,346.0,346.0,346.0,346.0
mean,-3.285747e-16,2.772349e-16,-3.850484e-17,-1.026796e-16,-1.694213e-16,-3.080388e-17,1.026796e-17
std,1.001448,1.001448,1.001448,1.001448,1.001448,1.001448,1.001448
min,-2.503394,-2.603974,-0.9889211,-1.784111,-1.517113,-1.566311,-0.6196773
25%,-0.7952507,-0.1219492,-0.9889211,-0.8924849,-1.029268,-1.101323,-0.6196773
50%,0.4076928,-0.1219492,0.1869095,-0.0008589845,-0.05357834,0.2936414,-0.6196773
75%,0.8289795,-0.1219492,1.36274,0.8907669,0.9221114,0.7586295,1.613743
max,1.00168,4.842101,1.36274,1.485184,1.409956,3.316064,1.613743


In [34]:
# Wrap the training target in a DataFrame for display.
t1 = pd.DataFrame(y_train)

In [35]:
# Preview the first five training targets.
t1.head()

Unnamed: 0,Lifetime Post Total Reach
405,4390
285,8628
30,4940
489,4800
137,1080
