In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import scipy.stats as stats
import os
import pickle
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import sklearn.model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report 


In [4]:
account_t=pd.read_csv('../accounts_data_trim.csv')
account_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64243 entries, 0 to 64242
Data columns (total 25 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       64243 non-null  int64  
 1   Unnamed: 0.1     64243 non-null  int64  
 2   posts            64243 non-null  int64  
 3   flw              64243 non-null  int64  
 4   flg              64243 non-null  int64  
 5   pic              64243 non-null  int64  
 6   link             64243 non-null  int64  
 7   caption_len_avg  64243 non-null  int64  
 8   cap_zero_per     64243 non-null  float64
 9   no_image_per     64243 non-null  float64
 10  likes_rate       64243 non-null  float64
 11  comment_rate     64243 non-null  float64
 12  loc_tag          64243 non-null  float64
 13  hash_count       64243 non-null  float64
 14  cosine_sim_avg   64243 non-null  float64
 15  post_interval    64243 non-null  float64
 16  class            64243 non-null  object 
 17  posts_a     

In [5]:
account_t.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'posts', 'flw', 'flg', 'pic', 'link',
       'caption_len_avg', 'cap_zero_per', 'no_image_per', 'likes_rate',
       'comment_rate', 'loc_tag', 'hash_count', 'cosine_sim_avg',
       'post_interval', 'class', 'posts_a', 'flw_a', 'flg_a', 'likes_a',
       'hash_a', 'cap_avg_a', 'comment_r_a', 'post_interval_a'],
      dtype='object')

In [6]:
#Create dummy or indicator features from categorical features
#Split into testing and training datasets 

#dfo=account_t.select_dtypes(include=['object']) # to select object type columns
dfo=account_t[['pic','link']] # all data was numeric 
df_droped=account_t.drop(['Unnamed: 0', 'Unnamed: 0.1','pic','link', 'posts', 'flw', 'flg', 'likes_rate','hash_count', 'caption_len_avg', 'comment_rate', 'post_interval','cosine_sim_avg' ], axis=1)
df = pd.concat([df_droped, pd.get_dummies(dfo)], axis=1)
df.head()



Unnamed: 0,cap_zero_per,no_image_per,loc_tag,class,posts_a,flw_a,flg_a,likes_a,hash_a,cap_avg_a,comment_r_a,post_interval_a,pic,link
0,0.0,0.0,0.0,f,44.0,48.0,325.0,0.0,0.0,12.0,0.0,0.094985,1,0
1,0.0,1.0,0.0,f,10.0,66.0,321.0,14.39,1.5,213.0,1.97,230.412857,1,0
2,0.0,1.0,0.0,f,33.0,970.0,308.0,10.1,2.5,436.0,0.3,43.569939,1,1
3,1.0,0.0,0.0,f,70.0,86.0,360.0,0.78,0.0,0.0,0.06,5.859799,1,0
4,0.0,0.0,0.667,f,3.0,21.0,285.0,14.29,0.0,93.0,0.0,0.126019,1,0


In [7]:
# Check for missing data again
df.isnull().any()
#if there are missing values we'll use to fill in any missing values
#X_d_median = X_train.median()

cap_zero_per       False
no_image_per       False
loc_tag            False
class              False
posts_a            False
flw_a              False
flg_a              False
likes_a            False
hash_a             False
cap_avg_a          False
comment_r_a        False
post_interval_a    False
pic                False
link               False
dtype: bool

In [50]:
# split the dataset
X=df.drop(['class'], axis=1)
y=df['class'].map({'f':1,'r':0 })

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=24)

In [9]:
# Standardize the magnitude of numeric features using a scaler- since some of the features are percintage 0 to 1 
#and some are big numbers. 

scaler=StandardScaler()
scaler.fit(X_train)
X_train_s=scaler.transform(X_train)
X_test_s=scaler.transform(X_test)

In [10]:
# Question 1- My categorical features were coded as int data type, is it good for the model?
            # do we need them to be int or object for the model to work ("correctly understanding the data")
# Question 2- There is no need to scale the categorical or the dummy variables but is it a mistake 
              #to transform the two i have (0/1) in the scaler

In [11]:
c_= [0.001,0.01,0.1,1,10,100]
accuracy=[]
precision=[]

for i in c_:

    Log_r = LogisticRegression( C = i ,random_state = 42)
    Log_r.fit(X_train_s, y_train)
    
    # Predict using model
    y_pred = Log_r.predict(X_test_s)
    ac=accuracy_score(y_test, y_pred)
    accuracy.append(ac)
    pr=precision_score(y_test, y_pred)
    precision.append(pr)


In [12]:
scores={"c_": c_, 'accuracy':accuracy, "precision":precision}
scores=pd.DataFrame(scores)
scores
#0ne is the best c hyperparameter.
# Precition score is important in this case as we want to detect the True positive and minimaize the false positives

Unnamed: 0,c_,accuracy,precision
0,0.001,0.807456,0.807061
1,0.01,0.808779,0.808052
2,0.1,0.808857,0.808082
3,1.0,0.809168,0.808491
4,10.0,0.809168,0.808491
5,100.0,0.809168,0.808491


In [13]:
# Now I will use the model with hyperparameter c=1 and cross validate my sample.
from sklearn.model_selection import cross_val_score
Log_r = LogisticRegression( C = 1 ,random_state = 42)
Log_r.fit(X_train_s, y_train)
scoring=['precision_macro','f1_macro','roc_auc']
for scor in scoring:
    cv_scores_train= cross_val_score(Log_r ,X_train_s,y_train,cv=5,scoring=scor)
    cv_scores_test= cross_val_score(Log_r,X_test_s, y_test,cv=5,scoring=scor)
    cv_scores_log_test= cv_scores_test.mean()
    cv_scores_log_train= cv_scores_train.mean()
    print(cv_scores_test)
    print("cv mean train score", cv_scores_log_train)
    print("cv mean test score", cv_scores_log_test)


[0.81726036 0.80052646 0.81409121 0.8015778  0.80461256]
cv mean train score 0.8051796406567366
cv mean test score 0.8076136784568092
[0.81710955 0.80035208 0.81400226 0.80155594 0.80458467]
cv mean train score 0.8051605995077722
cv mean test score 0.8075208990306818
[0.88713543 0.86394999 0.8794774  0.86983961 0.87478787]
cv mean train score 0.8751093221517813
cv mean test score 0.8750380603861618


In [14]:
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import f1_score
#Log_r.fit(X_train_s, y_train)
#cnf_matrix= confusion_matrix(y_test,y_pred, normalize='true')
#print(cnf_matrix)
#pr_1=precision_score(y_test, y_pred)
#f1_1=f1_score(y_test, y_pred)
#print(pr_1, f1_1)
# Precition score is important in this case as we want to detect the True positive and minimaize the false positives

In [15]:
# Next I will try the K nearest neighbor

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
#List Hyperparameters that we want to tune:
leaf_size = list(range(1,50))
n_neighbors = list(range(1,10))
p=[1,2]

hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

knn = KNeighborsClassifier()
#Use GridSearch
clf = RandomizedSearchCV(knn, hyperparameters, cv=10)
best_model = clf.fit(X_train_s, y_train)

#Print The value of best Hyperparameters
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])


Best leaf_size: 10
Best p: 1
Best n_neighbors: 9


In [25]:
from sklearn.metrics import confusion_matrix
knn = KNeighborsClassifier(p=1, leaf_size=10 ,n_neighbors=9)
knn.fit(X_train_s,y_train)

# Predict using model:

y_pred_knn=knn.predict(X_test_s)

#Confusion matrix:

matrix = confusion_matrix(y_test, y_pred_knn)
print(matrix)
pr_knn=precision_score(y_test, y_pred_knn)
print(pr_knn)

[[5893  551]
 [1326 5079]]
0.9021314387211368


In [21]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report 

print(roc_auc_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))

# This provides a lower ROC_AUC score, but better accuracy and precition. Which is a better model for our goal.

0.8537341709587336
              precision    recall  f1-score   support

           0       0.82      0.91      0.86      6444
           1       0.90      0.79      0.84      6405

    accuracy                           0.85     12849
   macro avg       0.86      0.85      0.85     12849
weighted avg       0.86      0.85      0.85     12849



In [26]:
cv_scores_train= cross_val_score(knn,X_train_s,y_train,cv=5,scoring='precision_macro')
cv_scores_test= cross_val_score(knn,X_test_s,y_test,cv=5,scoring='precision_macro')
print(cv_scores_test)
cv_scores_knn_test= cv_scores_test.mean()
cv_scores_knn_train= cv_scores_train.mean()
cv_scores_std_knn= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_knn_test))
print ('Mean cross validation train score: ' +str(cv_scores_knn_train))
print ('Standard deviation in cv test scores: ' +str(cv_scores_std_knn))
# After cross validation the prediction score wasn't as high as in the initial sample, 
# but still higher than in logistic regression.

[0.85574407 0.84599786 0.84590252 0.86544197 0.84672213]
Mean cross validation test score: 0.851961712317042
Mean cross validation train score: 0.8561709186229679
Standard deviation in cv test scores: 0.007691009539526452


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [32]:
#RF no max depth :
Rf = RandomForestClassifier(bootstrap=True)
Rf.fit(X_train_s, y_train)
    #Predict using the model:
y_predict_Rf = Rf.predict(X_test_s)
    #Confusion matrix:
c_matrix = confusion_matrix(y_test, y_predict_Rf)
ra_a=roc_auc_score(y_test, y_predict_Rf)
precision_3=precision_score(y_test, y_predict_Rf)
print(c_matrix)
print("roc_auc "+ str(ra_a))
print("precision "+str(precision_3)

# This model is overfitting and I will try some parameter tuning.

[[6188  256]
 [1118 5287]] 0.8928609951780573 0.9538156233086776


In [33]:
roc_auc=[]
precitions=[]
max_d=[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]
for max_depth in range(2,20):
    rf = RandomForestClassifier(bootstrap=True, max_depth=max_depth)
    rf.fit(X_train_s, y_train)
    #Predict using the model:
    y_predict_rf = rf.predict(X_test_s)
    #Confusion matrix:
    cnf_matrix = confusion_matrix(y_test, y_predict_rf)
    ra=roc_auc_score(y_test, y_predict_rf)
    pr_3=precision_score(y_test, y_predict_rf)
    roc_auc.append(ra)
    precitions.append(pr_3)
    #print(roc_auc_score(y_test, y_predict_rf))
    #print(classification_report(y_test, y_predict_rf)) 

In [35]:

scores_rf={"max_depth": max_d, 'roc_auc':roc_auc, "precision":precitions}
scores_=pd.DataFrame(scores_rf)
print(scores_)
scores_best_md = max(precitions)
best_roc=max(roc_auc)
print("best precition " +str( scores_best_md), "  best roc_auc "+ str(best_roc))

#The roc_auc reconds 6 trees, the precition 14 with higher score

    max_depth   roc_auc  precision
0           2  0.855777   0.950732
1           3  0.858160   0.936097
2           4  0.874567   0.961872
3           5  0.875493   0.966079
4           6  0.877443   0.966783
5           7  0.879385   0.970378
6           8  0.883046   0.973735
7           9  0.884531   0.973103
8          10  0.886020   0.971037
9          11  0.888521   0.970144
10         12  0.889930   0.968832
11         13  0.890792   0.967844
12         14  0.891266   0.965793
13         15  0.892838   0.961983
14         16  0.894013   0.960738
15         17  0.892379   0.958569
16         18  0.892931   0.956459
17         19  0.891843   0.954859
best precition 0.973735032831209   best roc_auc 0.8940125847328888


In [36]:
#I will try the best max_depth with 6 and take aloot at the classification report:
rf_1 = RandomForestClassifier(max_depth=6)
rf_1.fit(X_train_s, y_train)
#Predict using the model:

y_predict_rf1 = rf_1.predict(X_test_s)
#Confusion matrix:
cnf_matrix_1 = confusion_matrix(y_test, y_predict_rf1)
ra_1=roc_auc_score(y_test, y_predict_rf1)
print(classification_report(y_test, y_predict_rf1))
print(cnf_matrix_1)
print(ra_1)

              precision    recall  f1-score   support

           0       0.82      0.98      0.89      6444
           1       0.97      0.78      0.86      6405

    accuracy                           0.88     12849
   macro avg       0.89      0.88      0.88     12849
weighted avg       0.89      0.88      0.88     12849

[[6286  158]
 [1411 4994]]
0.8775922122061879


In [37]:
# Now crossvalidate again:
cv_scores_train= cross_val_score(rf_1,X_train_s,y_train,cv=10,scoring='precision_macro')
cv_scores_test= cross_val_score(rf_1,X_test_s,y_test,cv=10,scoring='precision_macro')
print(cv_scores_test)
cv_scores_RF_test= cv_scores_test.mean()
cv_scores_RF_train= cv_scores_train.mean()
cv_scores_std_RF= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_RF_test))
print ('Mean cross validation train score: ' +str(cv_scores_RF_train))
print ('Standard deviation in cv test scores: ' +str(cv_scores_std_RF))

# We have a very good precition, accuracy and roc_auc using the RF classifier, but the model might be overfitting.

[0.88829634 0.89977496 0.88345903 0.89410592 0.88342732 0.88956149
 0.89957632 0.88937472 0.89548182 0.88706115]
Mean cross validation test score: 0.8910119061269945
Mean cross validation train score: 0.892926061684121
Standard deviation in cv test scores: 0.005674787793995626


In [None]:
# This shows good prediction on an unseen test set. 
# Just to make sure I will try with more hyperparameter tuning and see if the moel performence
#on the test score is still good.

In [42]:
rf_entropy = RandomForestClassifier(bootstrap=True, max_depth=6, criterion='entropy')
rf_entropy.fit(X_train_s, y_train)
#Predict using the model:
y_predict_entropy = rf_entropy.predict(X_test_s)
#Confusion matrix:
cnf_matrix_1 = confusion_matrix(y_test, y_predict_entropy)
ra_1=roc_auc_score(y_test, y_predict_entropy)
print(classification_report(y_test, y_predict_entropy))
print(cnf_matrix_1)
print(ra_1)

              precision    recall  f1-score   support

           0       0.81      0.98      0.89      6444
           1       0.97      0.78      0.86      6405

    accuracy                           0.88     12849
   macro avg       0.89      0.88      0.88     12849
weighted avg       0.89      0.88      0.88     12849

[[6310  134]
 [1437 4968]]
0.8774247452743653


In [46]:
n_estimators = list(range(1,50))
max_depth = list(range(1,20))
min_samples_leaf= list(range(1,10))

hyperparameters = dict(n_estimators=n_estimators, max_depth= max_depth, min_samples_leaf=min_samples_leaf)

#Use GridSearch
clf = RandomizedSearchCV(Rf, hyperparameters, cv=10)
best_model = clf.fit(X_train_s, y_train)

#Print The value of best Hyperparameters
print('Best n_estimators:', best_model.best_estimator_.get_params()['n_estimators'])
print('Best max_depth:', best_model.best_estimator_.get_params()['max_depth'])
print('Best min_samples_leaf:', best_model.best_estimator_.get_params()['min_samples_leaf'])

Best n_estimators: 23
Best max_depth: 15
Best min_samples_leaf: 8


In [48]:
# To make sure I will crossvalidate the sample with the best Random Forest model.
RF=RandomForestClassifier(max_depth=15, n_estimators=23, min_samples_leaf=8)
RF.fit(X_train_s, y_train)
y_pred_RF=RF.predict(X_test_s)
cnf_matrix_RF = confusion_matrix(y_test, y_pred_RF)
ra_RF=roc_auc_score(y_test, y_pred_RF)
print(classification_report(y_test, y_pred_RF))
print(cnf_matrix_RF)
print(ra_RF)

              precision    recall  f1-score   support

           0       0.84      0.97      0.90      6444
           1       0.96      0.82      0.88      6405

    accuracy                           0.89     12849
   macro avg       0.90      0.89      0.89     12849
weighted avg       0.90      0.89      0.89     12849

[[6228  216]
 [1168 5237]]
0.8920614568750846
