In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Hyperbole Detection

This notebook contains a random forest model that has been trained on the feature-engineered dataset (which was created in <b>hyperbole_feature_engineering.ipynb</b>).

This notebook will be further enhanced in the future and should just be viewed as a starting point. 

In [None]:
# Relative location of the feature-engineered hyperbole CSV that the rest of
# the notebook loads.
PATH_TO_PREPARED_DATASET = (
    "../../data/hyperbole_detection/hyperboles_feature_engineered.csv"
)

In [None]:
# Read the prepared CSV into a DataFrame; every later cell derives from it.
df_hyperboles = pd.read_csv(filepath_or_buffer=PATH_TO_PREPARED_DATASET)

In [None]:
# Display the column names of the loaded dataset.
df_hyperboles.columns

In [None]:
# Randomly assign roughly 80% of the rows to the training set and the rest to
# the held-out test set. The generator is seeded (same seed value as the
# random_state used for the model) so that "Restart Kernel -> Run All"
# reproduces the same split, and therefore the same reported scores.
split_rng = np.random.default_rng(42)
msk = split_rng.random(len(df_hyperboles)) < 0.8

df_train = df_hyperboles[msk]

df_test = df_hyperboles[~msk]

# Every row must land in exactly one of the two splits.
assert len(df_train) + len(df_test) == len(df_hyperboles)

In [None]:
# Give both splits a clean 0..n-1 index so that the (purely positional) TF-IDF
# rows line up with the hand-crafted feature rows and the labels.
# drop=True keeps this cell idempotent: re-running it no longer piles up
# 'index' / 'level_0' helper columns on df_train / df_test.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Fit the TF-IDF vocabulary on the training text only, then apply the same
# vocabulary to the test text so no test information leaks into the features.
vectorizer = TfidfVectorizer()
train_tf_idf_features = vectorizer.fit_transform(df_train['german']).toarray()
test_tf_idf_features  = vectorizer.transform(df_test['german']).toarray()

# One *string* column name per vocabulary term, ordered by matrix column.
# Recent scikit-learn versions refuse DataFrames whose column names mix int and
# str, which is what happened when the default integer TF-IDF column names
# were combined with the named features below. The "tfidf_" prefix also
# prevents a vocabulary term from colliding with a hand-crafted feature name.
tf_idf_columns = [
    "tfidf_" + term
    for term, _ in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])
]

# Converting the TF-IDF matrices to DataFrames
train_tf_idf = pd.DataFrame(train_tf_idf_features, columns=tf_idf_columns)
test_tf_idf = pd.DataFrame(test_tf_idf_features, columns=tf_idf_columns)

# Separating train and test labels from all features
train_Y = df_train['label']
test_Y = df_test['label']

# Listing all hand-crafted (non-text) features
features = ['imageability', 'polarity_senti_ws',
       'polarity_text_blob', 'subjectivity_text_blob', 'vader_positive',
       'vader_neutral', 'vader_negative', 'vader_compound']

# Combining the TF-IDF columns with the hand-crafted features, row by row.
# Both sides share the same 0..n-1 index, so a column-wise concat is enough.
train = pd.concat([train_tf_idf, df_train[features]], axis=1)
test = pd.concat([test_tf_idf, df_test[features]], axis=1)

# Sanity checks: no rows gained or lost, and both splits share one schema.
assert len(train) == len(train_Y) == len(df_train)
assert len(test) == len(test_Y) == len(df_test)
assert list(train.columns) == list(test.columns)

In [None]:
def _print_report(y_true, y_pred, prefix=""):
    """Print accuracy (in percent) and the per-class report for one evaluation.

    y_true: ground-truth labels.
    y_pred: labels predicted by the model, in the same order as y_true.
    prefix: text put in front of the headings (e.g. "Validation ").
    """
    # scikit-learn metrics take (y_true, y_pred) in that order.
    print(prefix + "Accuracy => ", round(accuracy_score(y_true, y_pred)*100, 2))
    print("\n" + prefix + "Random Forest Classifier results: \n")
    # NOTE(review): the report previously forced target_names=['real', 'fake'],
    # which does not describe a hyperbole-detection task and looks copied from
    # another notebook. Without target_names the report shows the actual label
    # values; pass explicit names once the label encoding is confirmed.
    print(classification_report(y_true, y_pred))


# Hold out 20% of the training frame for a quick internal check; the separate
# `test` frame is only used for the second ("Validation") report below.
X_train, X_test, y_train, y_test = train_test_split(train, train_Y, test_size=0.2, random_state = 42)

# Random Forest Classifier
# n_jobs=-1 builds the 1000 trees in parallel; with a fixed random_state the
# fitted model is the same as with a single job.
_RandomForestClassifier = RandomForestClassifier(n_estimators = 1000, min_samples_split = 15, random_state = 42, n_jobs = -1)
_RandomForestClassifier.fit(X_train, y_train)
_RandomForestClassifier_prediction = _RandomForestClassifier.predict(X_test)
val_RandomForestClassifier_prediction = _RandomForestClassifier.predict(test)

_print_report(y_test, _RandomForestClassifier_prediction)
_print_report(test_Y, val_RandomForestClassifier_prediction, prefix="Validation ")

In [None]:
# Show how many training rows carry each label value.
train_Y.value_counts()