In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_validate

## Hyperbole Detection

This notebook contains a random forest model that is trained on the feature-engineered dataset (which was created in <b>hyperbole_feature_engineering.ipynb</b>).

In [None]:
# Relative path to the feature-engineered hyperbole CSV that is loaded below.
PATH_TO_PREPARED_DATASET = "../../data/hyperbole_detection/hyperboles_feature_engineered.csv"

In [None]:
# Load the prepared dataset into a DataFrame.
df_hyperboles = pd.read_csv(PATH_TO_PREPARED_DATASET)

In [None]:
# Inspect which columns are available in the loaded dataset.
df_hyperboles.columns

In [None]:
# Random ~80/20 row split into train and test sets.
# A dedicated, seeded generator makes the split (and every metric derived
# from it) reproducible across kernel restarts and re-runs of this cell.
rng = np.random.default_rng(42)
msk = rng.random(len(df_hyperboles)) < 0.8

df_train = df_hyperboles[msk]

df_test = df_hyperboles[~msk]

In [None]:
# Fit the TF-IDF vocabulary on the training sentences only and reuse it for
# the test sentences, so no test-set vocabulary leaks into training.
vectorizer = TfidfVectorizer()
train_tf_idf_features = vectorizer.fit_transform(df_train['german']).toarray()
test_tf_idf_features  = vectorizer.transform(df_test['german']).toarray()

# Converting the dense TF-IDF matrices to DataFrames.
# The columns get string names: a frame that mixes integer and string column
# names is rejected by recent scikit-learn versions when it is passed to fit().
tf_idf_columns = ["tfidf_" + str(i) for i in range(train_tf_idf_features.shape[1])]
train_tf_idf = pd.DataFrame(train_tf_idf_features, columns=tf_idf_columns)
test_tf_idf = pd.DataFrame(test_tf_idf_features, columns=tf_idf_columns)

# Separating train and test labels from all features
train_Y = df_train['label']
test_Y = df_test['label']

# Listing all features
features = ['imageability', 'polarity_senti_ws',
       'polarity_text_blob', 'subjectivity_huggingface', 'vader_positive',
       'vader_neutral', 'vader_negative', 'vader_compound']

# Merging the features with the TF-IDF columns.
# The TF-IDF frames have a fresh 0..n-1 index, so the engineered features are
# re-indexed positionally (drop=True) before the column-wise concat. df_train
# and df_test themselves are left untouched, which keeps this cell idempotent.
train = pd.concat([train_tf_idf, df_train[features].reset_index(drop=True)], axis=1)

test = pd.concat([test_tf_idf, df_test[features].reset_index(drop=True)], axis=1)

In [None]:
# Random Forest Classifier
# Hold out 20% of the training rows as an internal evaluation split; the
# separate `test` frame is scored afterwards as a second, untouched set.
X_train, X_test, y_train, y_test = train_test_split(train, train_Y, test_size=0.2, random_state = 42)
_RandomForestClassifier = RandomForestClassifier(n_estimators = 1000, min_samples_split = 15, random_state = 42)
_RandomForestClassifier.fit(X_train, y_train)
_RandomForestClassifier_prediction = _RandomForestClassifier.predict(X_test)
val_RandomForestClassifier_prediction = _RandomForestClassifier.predict(test)

# Metrics are called as (y_true, y_pred), the order scikit-learn documents.
# No target_names are passed: the report then shows the actual label values
# instead of class names that do not belong to the hyperbole task.
print("Accuracy => ", round(accuracy_score(y_test, _RandomForestClassifier_prediction)*100, 2))
print("\nRandom Forest Classifier results: \n")
print(classification_report(y_test, _RandomForestClassifier_prediction))
print("Validation Accuracy => ", round(accuracy_score(test_Y, val_RandomForestClassifier_prediction)*100, 2))
print("\nValidation Random Forest Classifier results: \n")
print(classification_report(test_Y, val_RandomForestClassifier_prediction))

In [None]:
# Show how many training rows fall into each label value.
train_Y.value_counts()

## Following the approach by Troiano et al

In the following section, the same types of models as in Troiano et al. are trained. Unfortunately, we do not know which model parameters were used in the paper; therefore, the default configuration is used.

### Defining the classifiers

Helper class that is used to create the table at the end

In [None]:
class Classifier:
    """Pairs a short display name with an estimator object.

    The name is used as the row label of the results table; the estimator is
    what gets evaluated.
    """

    def __init__(self, name, clf):
        """Store the display name and the estimator."""
        self.name, self.clf = name, clf

    # --- display name -----------------------------------------------------

    def get_name(self):
        """Return the display name."""
        return self.name

    def set_name(self, name):
        """Replace the display name."""
        self.name = name

    # --- wrapped estimator --------------------------------------------------

    def get_clf(self):
        """Return the wrapped estimator."""
        return self.clf

    def set_clf(self, clf):
        """Replace the wrapped estimator."""
        self.clf = clf

All of the following classifiers besides random forest were used in the paper.

In [None]:
# One wrapper per model type. All estimators keep their default
# hyperparameters; random_state is fixed on the two stochastic tree-based
# models only, so that the results table is reproducible between runs.
Lr_Classifier = Classifier("LR",LogisticRegression())
Knn_Classifier = Classifier("KNN",KNeighborsClassifier())
Nb_Classifier = Classifier("NB",GaussianNB())
Dt_Classifier = Classifier("DT", DecisionTreeClassifier(random_state=42))
# The SVM is the only model wrapped in a scaling pipeline.
Svm_Classifier = Classifier("SVM", make_pipeline(StandardScaler(), SVC(gamma='auto')))
Lda_Classifier = Classifier("LDA", LinearDiscriminantAnalysis())
Rf_Classifier = Classifier("RF", RandomForestClassifier(random_state=42))

A list containing all the classifiers which were defined above

In [None]:
# Evaluation order; this is also the row order of the results table.
list_of_classifiers = [
    Lr_Classifier,
    Knn_Classifier,
    Nb_Classifier,
    Dt_Classifier,
    Svm_Classifier,
    Lda_Classifier,
    Rf_Classifier,
]

### Data

In [None]:
# Relative paths to the three feature-engineered CSV files that are compared.
# This first path is the same file as PATH_TO_PREPARED_DATASET above.
DATASET_GOOGLE_TRANSLATED_ALL = "../../data/hyperbole_detection/hyperboles_feature_engineered.csv"

# Google-translated subset.
DATASET_GOOGLE_TRANSLATED_400 = "../../data/hyperbole_detection/hyperboles_google_translated_feature_engineered.csv"

# Manually translated subset.
DATASET_MANUALLY_TRANSLATED_400 = "../../data/hyperbole_detection/hyperboles_manually_translated_feature_engineered.csv"

The following block is used to load the different dataframes.

**df_input_machine_translated_all** contains all the sentences in machine-translated form, created with Google Translate.

**df_input_machine_translated_400** contains 400 sentences that were machine-translated with Google Translate. This dataset was created so that the manually translated sentences can be compared with the machine-translated ones.

**df_input_manually_translated_400** contains the 400 manually translated sentences

In [None]:
# Read one DataFrame per experiment dataset from the paths defined above.
df_input_machine_translated_all = pd.read_csv(DATASET_GOOGLE_TRANSLATED_ALL)
df_input_machine_translated_400 = pd.read_csv(DATASET_GOOGLE_TRANSLATED_400)
df_input_manually_translated_400 = pd.read_csv(DATASET_MANUALLY_TRANSLATED_400)

### Training, Testing and Evaluation

The following block contains the relevant columns for the classification

In [None]:
# Input feature columns handed to every classifier.
X_columns = [
    'imageability',
    'polarity_senti_ws',
    'polarity_text_blob',
    'subjectivity_huggingface',
    'vader_positive',
    'vader_neutral',
    'vader_negative',
    'vader_compound',
]

# Column holding the class label to predict.
y_column = 'label'

The functions defined below are used to train the models and to evaluate them. The results can then be presented in a table

In [None]:
def create_resulting_table(experiment_names, experiment_dfs, list_X_columns, list_y_column, classifiers, cv_k=10):
    """Build a table of mean cross-validated accuracies.

    Each experiment contributes one column and each classifier one row.

    Parameters
    ----------
    experiment_names : list of str
        Column labels of the resulting table, one per experiment.
    experiment_dfs : list of pandas.DataFrame
        One input frame per experiment.
    list_X_columns : list
        For each experiment, the feature column names to select.
    list_y_column : list
        For each experiment, the label column name.
    classifiers : list
        Objects exposing ``get_name()`` and ``get_clf()``.
    cv_k : int, default 10
        Number of cross-validation folds, forwarded to
        ``create_result_column``.

    Returns
    -------
    pandas.DataFrame
        Mean test accuracies, indexed by classifier name, with one column
        per experiment.
    """
    # The per-experiment lists are indexed by position so that a list that
    # is too short still fails with an IndexError instead of being silently
    # truncated.
    results = [
        create_result_column(
            experiment_df, list_X_columns[position], list_y_column[position],
            classifiers, cv_k)
        for position, experiment_df in enumerate(experiment_dfs)
    ]

    resulting_df = pd.concat(results, axis=1)
    resulting_df.columns = experiment_names

    return resulting_df
    

def create_result_column(df, X_columns, y_column, classifiers, cv_k = 10):
    """Evaluate every classifier on one dataset.

    Returns a pandas Series holding the mean cross-validated accuracy of
    each classifier, indexed by the classifier names.
    """
    row_labels = get_names_from_classifiers(classifiers)
    features, target = get_X_and_y(df, X_columns, y_column)

    mean_accuracies = [
        get_mean_accuracy_from_cv(entry.get_clf(), features, target, cv_k)
        for entry in classifiers
    ]

    return pd.Series(mean_accuracies, index=row_labels)

def get_names_from_classifiers(classifiers):
    """Return the display names of the given classifier wrappers, in order."""
    return [wrapper.get_name() for wrapper in classifiers]

def get_X_and_y(df, X_columns, y_column):
    """Split a frame into its feature columns and its label column.

    Returns a ``(features, labels)`` tuple: the selection of ``X_columns``
    and the single column ``y_column``.
    """
    features = df.loc[:, X_columns]
    labels = df[y_column]
    return features, labels

def get_mean_accuracy_from_cv(clf, X, y, cv_k):
    """Return the mean test accuracy of ``clf`` under ``cv_k``-fold cross-validation.

    Parameters
    ----------
    clf : estimator
        Estimator passed to ``cross_validate``.
    X, y :
        Feature matrix and labels.
    cv_k : int
        Value passed as ``cv`` to ``cross_validate``.
    """
    # Only accuracy is requested: the previously requested 'r2' score is a
    # regression metric whose result was never used. The one-element tuple
    # (rather than the bare string) keeps the result key "test_accuracy".
    cv_results = cross_validate(clf, X, y, cv=cv_k, scoring=('accuracy',))

    return cv_results["test_accuracy"].mean()

def train_model(classifier, X_train, y_train):
    """Fit ``classifier`` on the training data and return what ``fit`` returns."""
    fitted = classifier.fit(X_train, y_train)
    return fitted

def get_accuracy(model, X_train, X_test, y_train, y_test):
    """Score a fitted model on the training and the test split.

    Returns a ``(train_accuracy, test_accuracy)`` tuple.
    """
    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)

    return (
        accuracy_score(y_train, predicted_train),
        accuracy_score(y_test, predicted_test),
    )

In [None]:
# One entry per experiment; the lists below are aligned by position.
experiment_names = [
    "machine_translated_all",
    "machine_translated_400",
    "manually_translated_400",
]
list_dfs = [
    df_input_machine_translated_all,
    df_input_machine_translated_400,
    df_input_manually_translated_400,
]
# Every experiment uses the same feature columns and the same label column.
X_columns_list = [X_columns] * len(experiment_names)
y_column_list = [y_column] * len(experiment_names)

In [None]:
# Cross-validate every classifier on every experiment dataset and collect the
# mean accuracies in one table (rows: classifiers, columns: experiments).
t = create_resulting_table(experiment_names, list_dfs, X_columns_list, y_column_list, list_of_classifiers)

In [None]:
# Display the results table.
t

In [None]:
# Print the results table as LaTeX source.
print(t.to_latex())