In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split


In [None]:
pd.reset_option('^display.', silent=True)

# Load the first 300,000 rows of the dataset (the original note describes this as
# roughly half of the data) and keep only rows that have both a tweet text and a label.
X = pd.read_csv('../input/hatred-on-twitter-during-metoo-movement/MeTooHate.csv', nrows=300000)
X = X.dropna(axis=0, subset=['text', 'category'])

# Separate the target from the predictors.
y = X['category']

# Drop the target plus the columns not used for modelling. Non-inplace drops keep
# this cell safe to re-run without reloading the CSV first.
cols_to_drop = ['status_id', 'created_at', 'location']
X = X.drop(columns=['category'] + cols_to_drop)

# Split the data while maintaining the proportion of hate/non-hate (stratify).
# A fixed random_state makes the split - and every metric computed from it -
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)


In [None]:
# Give features AND labels a fresh 0..n-1 index. Resetting only the feature frames
# would leave y_train / y_test on the shuffled original index, so any index-aligned
# operation (boolean masks such as X_train[col][y_train == 1], concat, join) would
# silently pair rows with the wrong labels.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Keep an untouched copy of the test frame: X_test is later rebound to a sparse
# feature matrix, and the raw tweet text is still needed for inspection.
X_test_stats = X_test.copy()

print("Total training samples:", len(X_train))
print("Total test samples:", len(X_test))

X_train.head(10)

In [None]:
# Summary statistics of the training features, as reported by DataFrame.describe().
X_train.describe()

In [None]:

# Display the training labels.
y_train

In [None]:
# Print every field of one training row (positional index 25) as a quick sanity check.
sample_index = 25
print(X_train.iloc[sample_index])

In [None]:
# Bar chart of how many training tweets fall into each label value.
y_train.value_counts().plot.bar()


In [None]:
# import matplotlib.pyplot as plt 
# plt.rc("font", size=14)
# import seaborn as sns
# sns.set(style="white") #white background style for seaborn plots
# sns.set(style="whitegrid", color_codes=True)

# plt.figure(figsize=(15,8))
# ax = sns.kdeplot(X_train["favourite_count"][y_train == 1], color="darkturquoise", shade=True)
# sns.kdeplot(X_train["favourite_count"][y_train == 0], color="lightcoral", shade=True)
# plt.legend(['hate' , 'non - hate'])
# plt.title('Density Plot of Fare for Surviving Population and Deceased Population')
# ax.set(xlabel='followers_count')
# plt.xlim(-20,200)
# plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: lower-cases the text, strips accents to ASCII, removes
# English stop words, and keeps only tokens containing at least one letter a-z
# (matched case-insensitively), so purely numeric tokens are discarded.
cv = CountVectorizer(
    lowercase=True,
    strip_accents='ascii',
    stop_words='english',
    token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b',
)

# Learn the vocabulary from the training tweets only, then apply it to the test tweets.
X_train_cv = cv.fit_transform(X_train['text'])
X_test_cv = cv.transform(X_test['text'])

In [None]:
# Scale the numerical features (followers, retweets etc.) using min/max values
# learned from the training split only, then apply the same scaling to the test split.
from sklearn.preprocessing import MinMaxScaler
cols = ['favorite_count', 'retweet_count', 'followers_count', 'friends_count', 'statuses_count']
scaler = MinMaxScaler().fit(X_train[cols])
X_train_sc = scaler.transform(X_train[cols])
X_test_sc = scaler.transform(X_test[cols])

In [None]:
# Merge the scaled numerical features with the bag-of-words count vectors.
# NOTE: from here on X_train / X_test are sparse matrices, no longer DataFrames, so
# the earlier cells that read X_train.text or X_train[cols] cannot be re-run without
# re-executing the load/split cells first.
import scipy.sparse as sp
train_count = sp.csr_matrix(X_train_cv)
train_num = sp.csr_matrix(X_train_sc)
# format='csr' yields a row-sliceable matrix directly instead of hstack's default COO.
X_train = sp.hstack([train_count, train_num], format='csr')

test_count = sp.csr_matrix(X_test_cv)
test_num = sp.csr_matrix(X_test_sc)
X_test = sp.hstack([test_count, test_num], format='csr')

# Per-tweet word counts for the training set, one column per vocabulary word.
# The frame is kept sparse: calling .toarray() here would materialise a dense
# (n_tweets x vocabulary_size) array, which can easily exhaust kernel memory.
# get_feature_names() was removed from recent scikit-learn releases, so prefer
# get_feature_names_out() and fall back to the old name on older versions.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
word_freq_df = pd.DataFrame.sparse.from_spmatrix(X_train_cv, columns=feature_names)

In [None]:
# Total number of occurrences of each vocabulary word across the training tweets.
word_freq_df.sum()

In [None]:
# The 20 most frequent vocabulary words in the training tweets, highest count first.
word_freq_df.sum().to_frame().sort_values(0, ascending=False).head(20)


In [None]:
# Naive Bayes: fit a multinomial model on the combined training features and
# predict labels for the test split.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, y_train)
predictions = clf.predict(X_test)

In [None]:
# Score of the naive Bayes model on the held-out test split.
clf.score(X_test , y_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.metrics import confusion_matrix

# Evaluate the naive Bayes predictions; rows of the confusion matrix are the true
# labels and columns the predicted labels, both ordered 0 then 1.
n_classes = 2
cm = confusion_matrix(y_test, predictions, labels=range(n_classes))

# Read the row count from .shape: converting the sparse test matrix to a dense
# array just to call len() would allocate the full matrix in memory.
print(f'Number of samples to classify: {X_test.shape[0]}\n')
print(f'Accuracy score: {accuracy_score(y_test, predictions)}')
print(f'Precision score: {precision_score(y_test, predictions)}')
print(f'Recall score: {recall_score(y_test, predictions)}\n')
print(f'f1 score: {f1_score(y_test, predictions)}\n')
print(f'Confusion matrix: \n{cm}')

In [None]:
!pip install yellowbrick


In [None]:
from yellowbrick.model_selection import learning_curve
# 10-fold cross-validated accuracy learning curve for the naive Bayes model on the
# training data; print() shows the repr of whatever the quick method returns.
print(learning_curve(clf, X_train, y_train, cv=10, scoring='accuracy'))

In [None]:
# Heatmap of the confusion matrix with each row scaled to sum to 1, i.e. the share
# of every true class that received each predicted label.
# Note: `cm` is overwritten with the row-normalised values.
plt.figure(figsize=(6,6))
row_totals = cm.sum(axis=1, keepdims=True)
cm = cm.astype(float) / row_totals
class_names = ['non-hate', 'hate']
sns.heatmap(cm, annot=True, square=True, cbar=False,
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted label')
plt.ylabel('True label')

In [None]:
# Plot the ROC curve for the MNB classifier
from sklearn.metrics import roc_curve

# A ROC curve needs a continuous score per sample so the decision threshold can be
# swept; hard 0/1 predictions collapse the curve to a single operating point.
# Column 1 of predict_proba is the probability of label 1, assuming the labels are
# 0 and 1 as the rest of the notebook treats them (classes_ is sorted).
hate_scores = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, hate_scores)

plt.figure(figsize=(8,8))
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.plot(fpr, tpr, label='MNB')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend()  # without this the 'MNB' label is never rendered
plt.show()

In [None]:
# Build a table of every test tweet with its true label and predicted label
# (the first 50 rows are displayed in a later cell).
# `predictions` holds the output of whichever classifier was predicted most recently.
# Vectorised mapping of 1 -> 'Hate', anything else -> 'Non-hate'; this avoids a Python
# loop and avoids densifying the sparse test matrix just to count its rows.
testing_predictions = np.where(predictions == 1, 'Hate', 'Non-hate').tolist()
check_df = pd.DataFrame({'actual_label': list(y_test), 'prediction': testing_predictions, 'text':list(X_test_stats.text)})

In [None]:
# Swap the numeric labels for readable names: 0 becomes 'Non-hate', then 1 becomes 'Hate'.
check_df = check_df.replace(0, 'Non-hate').replace(1, 'Hate')

In [None]:
# First 50 test tweets with their actual and predicted labels.
check_df.head(50)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the same combined features, with a raised iteration cap of
# 1000 for the lbfgs solver. `predictions` now holds this model's test-set output and
# replaces the naive Bayes predictions.
clf1 = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train, y_train)
predictions = clf1.predict(X_test)

In [None]:
# Score of the logistic regression model on the held-out test split.
clf1.score(X_test, y_test)

In [None]:
# 10-fold cross-validated accuracy learning curve for logistic regression, matching the
# naive Bayes curve. The previous cv=10000 asked for ten thousand folds (tens of
# thousands of model fits), which is not feasible and looks like a typo for 10.
print(learning_curve(clf1, X_train, y_train, cv=10, scoring='accuracy'))

In [None]:
# f1_score is imported here as well so this cell does not depend on an earlier cell's import.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Evaluate the logistic regression predictions; rows of the confusion matrix are the
# true labels and columns the predicted labels, both ordered 0 then 1.
n_classes = 2
cm_log = confusion_matrix(y_test, predictions, labels=range(n_classes))

# Read the row count from .shape instead of densifying the sparse test matrix.
print(f'Number of samples to classify: {X_test.shape[0]}\n')
print(f'Accuracy score: {accuracy_score(y_test, predictions)}')
print(f'Precision score: {precision_score(y_test, predictions)}')
print(f'Recall score: {recall_score(y_test, predictions)}\n')
print(f'f1 score: {f1_score(y_test, predictions)}\n')
# Print this model's matrix (cm_log); `cm` belongs to the naive Bayes evaluation.
print(f'Confusion matrix: \n{cm_log}')

In [None]:
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import ClassificationReport

# Per-class precision / recall / F1 report for logistic regression. The estimator uses
# the same hyper-parameters as clf1 so the report describes the same model that was
# evaluated above; the library default iteration cap is lower than clf1's 1000.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
visualizer = ClassificationReport(model)

visualizer.fit(X_train, y_train)    # train on the training split
visualizer.score(X_test, y_test)    # evaluate on the held-out test split
visualizer.show()

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from yellowbrick.model_selection import CVScores

# Create a cross-validation strategy: 12 shuffled, stratified folds with a fixed seed.
# Use a dedicated name: rebinding `cv` would shadow the CountVectorizer fitted earlier
# and break any re-run of the cells that still need its vocabulary.
cv_splitter = StratifiedKFold(n_splits=12, random_state=42, shuffle=True)

# Instantiate the classification model and visualizer
model = MultinomialNB()
visualizer = CVScores(model, cv=cv_splitter, scoring='f1_weighted')

# Cross-validate on the training split; the test split stays untouched so it remains
# a genuine hold-out set for the final evaluation.
visualizer.fit(X_train, y_train)
visualizer.show()

In [None]:
# from sklearn import svm
# clf2 = svm.SVC(kernel = 'linear')
# clf2.fit(X_train , y_train)
# y_pred = clf2.predict(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier configured with 100 neighbours, fitted on the
# combined training features.
knn = KNeighborsClassifier(n_neighbors=100)
knn.fit(X_train, y_train)


In [None]:
# Predict test labels with the fitted k-NN model; y_pred is not evaluated in the cells shown here.
y_pred = knn.predict(X_test)