In [2]:
#SVM Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics

#Logistic Regression Imports
from sklearn import linear_model

#KNN Imports
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split 

#model functions
def train_svm(df, df_xcols, df_ycol, kernel_type):
    x_svm = df_xcols
    y_svm = df[df_ycol]
    # Split dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(x_svm, y_svm, test_size=0.3) # 70% training and 30% test

    clf = SVC(kernel=kernel_type)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # Model Accuracy: how often is the classifier correct?
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    
    #Model Precision
    print("Precision:",metrics.precision_score(y_test, y_pred))

    #Model Recall
    print("Recall:",metrics.recall_score(y_test, y_pred))
    
    #Model F1 Score
    print("F1 Score: ", metrics.f1_score(y_test, y_pred))
    
    return clf
    
def train_logr(df, feature_cols, df_ycol):
    x_logr = feature_cols
    y_logr = df[df_ycol]
    # Split dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(x_logr, y_logr, test_size=0.3) # 70% training and 30% test

    logr = linear_model.LogisticRegression(max_iter = 1000)#increased max iter bc when...
    #passing in all features, algorithm doesn't converge so have to adjust max_iter
    logr.fit(X_train, y_train.values.ravel())
    y_pred = logr.predict(X_test)
    
    # Model Accuracy: how often is the classifier correct?
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    
    #Model Precision
    print("Precision:",metrics.precision_score(y_test, y_pred))

    #Model Recall
    print("Recall:",metrics.recall_score(y_test, y_pred))
    
    #Model F1 Score
    print("F1 Score: ", metrics.f1_score(y_test, y_pred))
    
    return logr

def train_KNN(df, feature_cols, df_ycol, neighbors):

    # Create feature and target arrays 
    x_KNN = feature_cols
    y_KNN = df[df_ycol]

    # Split into training and test set 
    X_train, X_test, y_train, y_test = train_test_split(x_KNN, y_KNN, test_size = 0.3) 

    knn = KNeighborsClassifier(n_neighbors=neighbors) 

    knn.fit(X_train, y_train) 

    # Predict on dataset which model has not seen before 
    y_pred = knn.predict(X_test)
    
    # Model Accuracy: how often is the classifier correct?
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    
    #Model Precision
    print("Precision:",metrics.precision_score(y_test, y_pred))

    #Model Recall
    print("Recall:",metrics.recall_score(y_test, y_pred))
    
    #Model F1 Score
    print("F1 Score: ", metrics.f1_score(y_test, y_pred))
    
    return knn


In [3]:
#Import csv's
instadf = pd.read_csv("insta_data.csv")
ytdf = pd.read_csv("youtube_data.csv")

In [None]:
#Training SVM on specific TM (Topic Modeling) Features

In [33]:
LdaOnly = instadf.drop(columns=["Clickbait", "Bert Feature", "LSA Feature"])
instamodel = train_svm(instadf, LdaOnly, "Clickbait", "rbf")

Accuracy: 0.5491905354919053
Precision: 0.5714285714285714
Recall: 0.03287671232876712
F1 Score:  0.06217616580310881


In [34]:
BertOnly = instadf.drop(columns=["Clickbait", "LDA Feature", "LSA Feature"])
instamodel = train_svm(instadf, BertOnly, "Clickbait", "rbf")

Accuracy: 0.5516811955168119
Precision: 0.4883720930232558
Recall: 0.02920723226703755
F1 Score:  0.05511811023622047


In [37]:
LSAOnly = instadf.drop(columns=["Clickbait", "LDA Feature", "Bert Feature"])
instamodel = train_svm(instadf, LSAOnly, "Clickbait", "rbf")

Accuracy: 0.5610211706102117
Precision: 0.6
Recall: 0.037815126050420166
F1 Score:  0.07114624505928854


In [None]:
#Training Logistic Regression Models on specific TM Features

In [18]:
instamodel = train_logr(instadf, instadf.drop(columns=["Clickbait"]), "Clickbait")

Accuracy: 0.7291407222914073
Precision: 0.6961038961038961
Recall: 0.7272727272727273
F1 Score:  0.7113470471134705


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
LDAOnly = instadf.drop(columns=["Clickbait", "LDA Feature", "LSA Feature"])
instamodel = train_logr(instadf, LDAOnly, "Clickbait")

Accuracy: 0.7733499377334994
Precision: 0.7064102564102565
Recall: 0.8032069970845481
F1 Score:  0.7517053206002728


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
BertOnly = instadf.drop(columns=["Clickbait", "LDA Feature", "LSA Feature"])
instamodel = train_logr(instadf, BertOnly, "Clickbait")

Accuracy: 0.7627646326276464
Precision: 0.7208387942332897
Recall: 0.766016713091922
F1 Score:  0.7427413909520594


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
LSAOnly = instadf.drop(columns=["Clickbait", "LDA Feature", "Bert Feature"])
instamodel = train_logr(instadf, LSAOnly, "Clickbait")

Accuracy: 0.7652552926525529
Precision: 0.7259923175416133
Recall: 0.7767123287671233
F1 Score:  0.7504963600264726


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#Traning KNN on specific TM Features

In [52]:
instamodel = train_KNN(instadf, instadf.drop(columns=["Clickbait"]), "Clickbait", 7)#Normal

Accuracy: 0.5560398505603985
Precision: 0.487062404870624
Recall: 0.45977011494252873
F1 Score:  0.4730229120473023


In [57]:
LDAOnly = instadf.drop(columns=["Clickbait", "LDA Feature", "LSA Feature"])
instamodel = train_KNN(instadf, LDAOnly, "Clickbait", 7)

Accuracy: 0.5672478206724783
Precision: 0.5191176470588236
Recall: 0.4895977808599168
F1 Score:  0.5039257673090649


In [58]:
BertOnly = instadf.drop(columns=["Clickbait", "LDA Feature", "LSA Feature"])
instamodel = train_KNN(instadf, BertOnly, "Clickbait", 7)

Accuracy: 0.549813200498132
Precision: 0.486646884272997
Recall: 0.4652482269503546
F1 Score:  0.4757070340826686


In [59]:
LSAOnly = instadf.drop(columns=["Clickbait", "LDA Feature", "Bert Feature"])
instamodel = train_KNN(instadf, LSAOnly, "Clickbait", 7)

Accuracy: 0.5579078455790785
Precision: 0.49765258215962443
Recall: 0.4497878359264498
F1 Score:  0.4725111441307578


In [60]:
pip install pandas tensorflow scikit-learn #Installing Packages

Collecting numpy>=1.16.5 (from pandas)
  Downloading numpy-1.24.3-cp39-cp39-macosx_10_9_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.3-cp39-cp39-macosx_10_9_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.6
    Uninstalling numpy-1.21.6:
      Successfully uninstalled numpy-1.21.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.5.0 requires daal==2021.4.0, which is not installed.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.24.3 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.24.3


[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
#Traning SVM on specific TM Features

In [4]:
ytmodel = train_svm(ytdf, ytdf.drop(columns=["class", "LDA Feature", "LSA Feature"]), 'class', 'rbf')

Accuracy: 0.8085106382978723
Precision: 0.9230769230769231
Recall: 0.6
F1 Score:  0.7272727272727273


In [7]:
ytmodel = train_svm(ytdf, ytdf.drop(columns=["class", "Bert Feature", "LSA Feature"]), 'class', 'rbf')

Accuracy: 0.7446808510638298
Precision: 0.8571428571428571
Recall: 0.5454545454545454
F1 Score:  0.6666666666666666


In [16]:
ytmodel = train_svm(ytdf, ytdf.drop(columns=["class", "LDA Feature", "Bert Feature"]), 'class', 'rbf')

Accuracy: 0.723404255319149
Precision: 0.8571428571428571
Recall: 0.5217391304347826
F1 Score:  0.6486486486486487


In [None]:
#Training Logistic Regression Models on specific TM Features

In [29]:
ytmodel = train_logr(ytdf, ytdf.drop(columns=["class", "LDA Feature", "LSA Feature"]), "class")

Accuracy: 0.9787234042553191
Precision: 0.9583333333333334
Recall: 1.0
F1 Score:  0.9787234042553191


In [18]:
ytmodel = train_logr(ytdf, ytdf.drop(columns=["class", "LDA Feature", "LSA Feature"]), "class")

Accuracy: 0.9574468085106383
Precision: 1.0
Recall: 0.9090909090909091
F1 Score:  0.9523809523809523


In [23]:
ytmodel = train_logr(ytdf, ytdf.drop(columns=["class", "Bert Feature", "LSA Feature"]), "class")

Accuracy: 0.9361702127659575
Precision: 1.0
Recall: 0.8695652173913043
F1 Score:  0.9302325581395349


In [32]:
ytmodel = train_logr(ytdf, ytdf.drop(columns=["class", "Bert Feature", "LDA Feature"]), "class")

Accuracy: 0.9787234042553191
Precision: 0.9615384615384616
Recall: 1.0
F1 Score:  0.9803921568627451


In [None]:
#Traning KNN Model on specific TM Features

In [36]:
ytmodel = train_KNN(ytdf, ytdf.drop(columns=["class", "LDA Feature", "LSA Feature"]), "class", 7)

Accuracy: 0.7872340425531915
Precision: 0.6842105263157895
Recall: 0.7647058823529411
F1 Score:  0.7222222222222222


In [41]:
ytmodel = train_KNN(ytdf, ytdf.drop(columns=["class", "Bert Feature", "LSA Feature"]), "class", 7)

Accuracy: 0.851063829787234
Precision: 0.8947368421052632
Recall: 0.7727272727272727
F1 Score:  0.8292682926829268


In [46]:
ytmodel = train_KNN(ytdf, ytdf.drop(columns=["class", "Bert Feature", "LDA Feature"]), "class", 7)

Accuracy: 0.851063829787234
Precision: 0.8260869565217391
Recall: 0.8636363636363636
F1 Score:  0.8444444444444444
