# Logistic Regression

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import operator
from sklearn.ensemble import RandomForestClassifier

In [2]:
instagram = pd.read_csv(os.path.join('train.csv'))
instagram.head()

Unnamed: 0,profile pic,nums/length username,fullname words,nums/length fullname,name==username,description length,external URL,private,#posts,#followers,#follows,fake
0,1,0.27,0,0.0,0,53,0,0,32,1000,955,0
1,1,0.0,2,0.0,0,44,0,0,286,2740,533,0
2,1,0.1,2,0.0,0,0,0,1,13,159,98,0
3,1,0.0,1,0.0,0,82,0,0,679,414,651,0
4,1,0.0,2,0.0,0,0,0,1,6,151,126,0


In [3]:
 # Assign X (data) and y (target)
X = instagram.drop("fake", axis=1)
y = instagram["fake"]
print(X.shape, y.shape)

(576, 11) (576,)


In [4]:
pd.get_dummies(X)

Unnamed: 0,profile pic,nums/length username,fullname words,nums/length fullname,name==username,description length,external URL,private,#posts,#followers,#follows
0,1,0.27,0,0.00,0,53,0,0,32,1000,955
1,1,0.00,2,0.00,0,44,0,0,286,2740,533
2,1,0.10,2,0.00,0,0,0,1,13,159,98
3,1,0.00,1,0.00,0,82,0,0,679,414,651
4,1,0.00,2,0.00,0,0,0,1,6,151,126
5,1,0.00,4,0.00,0,81,1,0,344,669987,150
6,1,0.00,2,0.00,0,50,0,0,16,122,177
7,1,0.00,2,0.00,0,0,0,0,33,1078,76
8,1,0.00,0,0.00,0,71,0,0,72,1824,2713
9,1,0.00,2,0.00,0,40,1,0,213,12945,813


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [6]:
X_train.head()

Unnamed: 0,profile pic,nums/length username,fullname words,nums/length fullname,name==username,description length,external URL,private,#posts,#followers,#follows
208,1,0.0,0,0.0,0,18,0,1,133,1008,517
114,1,0.0,1,0.0,0,0,0,0,1,331,333
21,1,0.0,1,0.0,0,35,1,1,35,1809,416
201,1,0.0,0,0.0,0,8,0,1,12,173,373
44,1,0.0,4,0.0,0,35,0,0,4494,12397719,8


In [7]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(penalty='l1')
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [8]:
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9050925925925926
Testing Data Score: 0.9375


In [10]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 1 1 1 0 0 0 0]
First 10 Actual labels: [0, 0, 0, 1, 1, 1, 0, 0, 0, 0]


In [11]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,1,1
4,1,1
5,1,1
6,0,0
7,0,0
8,0,0
9,0,0


# Random Forest

In [12]:
 # Create a random forest classifier
results = {}
for i in range(10, 300, 10):
    rf = RandomForestClassifier(n_estimators=i)
    rf = rf.fit(X_train, y_train)
    print(rf.score(X_test, y_test))
    results.update({i:rf.score(X_test, y_test)})

0.9375
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9305555555555556
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9513888888888888
0.9444444444444444
0.9375
0.9305555555555556
0.9513888888888888
0.9305555555555556
0.9444444444444444
0.9513888888888888
0.9375
0.9444444444444444
0.9513888888888888
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444


In [13]:
best_estimator=max(results.items(), key=operator.itemgetter(1))[0]

In [14]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.09847568, 0.13539737, 0.04893857, 0.00909504, 0.00094756,
       0.09993313, 0.00862961, 0.01112036, 0.19063412, 0.31091561,
       0.08591295])

In [15]:
rf = RandomForestClassifier(n_estimators=best_estimator)
rf = rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))


0.9444444444444444


# Random Forest Feature Engineering: Remove Low-Importance Features

In [16]:
 # Assign X (data) and y (target)
X2 = instagram.drop(["fake", "nums/length fullname", "name==username", "external URL"], axis=1)
y2 = instagram["fake"]
print(X2.shape, y2.shape)

(576, 8) (576,)


In [17]:
pd.get_dummies(X2)

Unnamed: 0,profile pic,nums/length username,fullname words,description length,private,#posts,#followers,#follows
0,1,0.27,0,53,0,32,1000,955
1,1,0.00,2,44,0,286,2740,533
2,1,0.10,2,0,1,13,159,98
3,1,0.00,1,82,0,679,414,651
4,1,0.00,2,0,1,6,151,126
5,1,0.00,4,81,0,344,669987,150
6,1,0.00,2,50,0,16,122,177
7,1,0.00,2,0,0,33,1078,76
8,1,0.00,0,71,0,72,1824,2713
9,1,0.00,2,40,0,213,12945,813


In [18]:
from sklearn.model_selection import train_test_split

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=1, stratify=y)

In [19]:
 # Create a random forest classifier
results2 = {}
for i in range(10, 300, 10):
    rf2 = RandomForestClassifier(n_estimators=i)
    rf2 = rf2.fit(X2_train, y2_train)
    print(rf2.score(X2_test, y2_test))
    results2.update({i:rf2.score(X2_test, y2_test)})

0.9166666666666666
0.9513888888888888
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9375
0.9305555555555556
0.9444444444444444
0.9583333333333334
0.9375
0.9375
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9513888888888888
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9375
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9375
0.9444444444444444


In [20]:
best_estimator=max(results2.items(), key=operator.itemgetter(1))[0]

In [21]:
best_estimator

110

In [22]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf2.feature_importances_
importances

array([0.10170158, 0.14541869, 0.05780009, 0.11761301, 0.01163518,
       0.1859129 , 0.29321642, 0.08670213])

In [23]:
rf2 = RandomForestClassifier(n_estimators=best_estimator)
rf2 = rf2.fit(X2_train, y2_train)
print(rf2.score(X2_test, y2_test))

0.9444444444444444


# Random Forest Feature Engineering: Apply Log Function to Minimize Outliers

In [24]:
import numpy as np

In [25]:
instagram_copy=instagram.copy()

In [26]:
instagram_copy['#followers']=instagram['#followers'].map(lambda x:np.log(x+1))

In [27]:
instagram.head()

Unnamed: 0,profile pic,nums/length username,fullname words,nums/length fullname,name==username,description length,external URL,private,#posts,#followers,#follows,fake
0,1,0.27,0,0.0,0,53,0,0,32,1000,955,0
1,1,0.0,2,0.0,0,44,0,0,286,2740,533,0
2,1,0.1,2,0.0,0,0,0,1,13,159,98,0
3,1,0.0,1,0.0,0,82,0,0,679,414,651,0
4,1,0.0,2,0.0,0,0,0,1,6,151,126,0


In [28]:
 # Assign X (data) and y (target)
X3 = instagram_copy.drop(["fake", "nums/length fullname", "name==username", "external URL"], axis=1)
y3 = instagram_copy["fake"]
print(X3.shape, y3.shape)

(576, 8) (576,)


In [29]:
pd.get_dummies(X3)

Unnamed: 0,profile pic,nums/length username,fullname words,description length,private,#posts,#followers,#follows
0,1,0.27,0,53,0,32,6.908755,955
1,1,0.00,2,44,0,286,7.916078,533
2,1,0.10,2,0,1,13,5.075174,98
3,1,0.00,1,82,0,679,6.028279,651
4,1,0.00,2,0,1,6,5.023881,126
5,1,0.00,4,81,0,344,13.415015,150
6,1,0.00,2,50,0,16,4.812184,177
7,1,0.00,2,0,0,33,6.983790,76
8,1,0.00,0,71,0,72,7.509335,2713
9,1,0.00,2,40,0,213,9.468542,813


In [30]:
from sklearn.model_selection import train_test_split

X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state=1, stratify=y)

In [31]:
 # Create a random forest classifier
results3 = {}
for i in range(10, 300, 10):
    rf3 = RandomForestClassifier(n_estimators=i)
    rf3 = rf3.fit(X3_train, y3_train)
    print(rf3.score(X3_test, y3_test))
    results3.update({i:rf3.score(X3_test, y3_test)})

0.9305555555555556
0.9513888888888888
0.9236111111111112
0.9375
0.9444444444444444
0.9513888888888888
0.9305555555555556
0.9375
0.9513888888888888
0.9375
0.9375
0.9444444444444444
0.9444444444444444
0.9375
0.9444444444444444
0.9375
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9375
0.9444444444444444
0.9305555555555556
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444
0.9444444444444444


In [32]:
instagram_copy.describe()

Unnamed: 0,profile pic,nums/length username,fullname words,nums/length fullname,name==username,description length,external URL,private,#posts,#followers,#follows,fake
count,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0
mean,0.701389,0.163837,1.460069,0.036094,0.034722,22.623264,0.116319,0.381944,107.489583,5.191867,508.381944,0.5
std,0.458047,0.214096,1.052601,0.125121,0.183234,37.702987,0.320886,0.486285,402.034431,2.569626,917.981239,0.500435
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.688879,57.5,0.0
50%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9.0,5.02058,229.5,0.5
75%,1.0,0.31,2.0,0.0,0.0,34.0,0.0,1.0,81.5,6.575064,589.5,1.0
max,1.0,0.92,12.0,1.0,1.0,150.0,1.0,1.0,7389.0,16.545879,7500.0,1.0


In [33]:
best_estimator=max(results3.items(), key=operator.itemgetter(1))[0]

In [34]:
best_estimator

20

In [35]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf3.feature_importances_
importances

array([0.11617263, 0.15559134, 0.05155888, 0.1004368 , 0.01149469,
       0.18400992, 0.28035696, 0.10037876])

In [36]:
rf3 = RandomForestClassifier(n_estimators=best_estimator)
rf3 = rf3.fit(X3_train, y3_train)
print(rf3.score(X3_test, y3_test))

0.9444444444444444


In [37]:
# Necessary imports: 
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

In [38]:
 scores = cross_val_score(rf3, X, y, cv=10)
print(f'Cross-validated scores:, {scores}')

Cross-validated scores:, [0.96551724 0.94827586 0.84482759 0.9137931  0.9137931  0.82758621
 0.96551724 0.98275862 0.85714286 0.96428571]


In [39]:
from scipy.stats import sem, t
from scipy import mean

confidence = 0.95

data = scores

n = len(data)
m = mean(data)
std_err = sem(data)
h = std_err * t.ppf((1 + confidence) / 2, n - 1)

start = m - h

end = m + h

print(f'We are 95% confident that the true mean of the population lies with in {start, end}')

We are 95% confident that the true mean of the population lies with in (0.8777182432439322, 0.9589812641452303)
