In [1]:
import pandas as pd
import os
import numpy as np
from datetime import datetime, timedelta, timezone
from time import sleep

# Data loading

In [2]:
path = os.getcwd()
path_to_folder = os.path.join(path, "data")

# class 1 represents toxic tweets, class 0 - normal tweets
dataset1 = pd.read_csv(os.path.join(path_to_folder, "toxic-tweets-dataset.csv"), encoding='utf-8')
dataset1.head()

Unnamed: 0.1,Unnamed: 0,Toxicity,tweet
0,0,0,@user when a father is dysfunctional and is s...
1,1,0,@user @user thanks for #lyft credit i can't us...
2,2,0,bihday your majesty
3,3,0,#model i love u take with u all the time in ...
4,4,0,factsguide: society now #motivation


In [3]:
len(dataset1["tweet"].unique())

54313

In [4]:
dataset1.shape

(56745, 3)

In [5]:
dataset1 = dataset1.drop_duplicates(subset = ["tweet"])
dataset1.shape

(54313, 3)

In [6]:
#class 0 - hate speech, class 1 - offensive language, class 2 - neither
dataset2 = pd.read_csv(os.path.join(path_to_folder, "hate-speech-and-offensive-language-dataset.csv"))
dataset2.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [7]:
dataset2.shape

(24783, 7)

In [8]:
len(dataset2["tweet"].unique())

24783

# Data preprocessing

## Datasets merging

In [9]:
dataset1 = dataset1.drop(dataset1.columns[0], axis = 1)

In [10]:
# tweet is toxic(1) or not(0)
dataset1 = dataset1.rename(columns={"Toxicity": "class"})

In [11]:
dataset2 = dataset2[["class", "tweet"]]
# 4163 non hate and offensive tweets
dataset2["class"].value_counts()

1    19190
2     4163
0     1430
Name: class, dtype: int64

In [12]:
dataset2.loc[dataset2["class"] == 0, "class"] = 1
dataset2.loc[dataset2["class"] == 2, "class"] = 0

In [13]:
dataset2["class"].value_counts()

1    20620
0     4163
Name: class, dtype: int64

# Now class 0 is normal tweets and class 1 is toxic/hate/offensive tweets

In [14]:
df = pd.concat([dataset1, dataset2])
df["class"].value_counts()

1    44544
0    34552
Name: class, dtype: int64

In [15]:
df.reset_index(inplace = True, drop=True)

In [16]:
df.head()

Unnamed: 0,class,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


Slightly imbalanced (55% normal tweets vs 45% toxic tweets)

## Data cleaning and transformation

### Punctuation, URLs and tags removal

In [17]:
# deleted @users, #hashtags and non-supported (non-alphanumeric) characters
# PROBLEM: (word) is also deteled
def clean_text(X):
    X = X.lower().split()
    X_clean = filter(str.isalnum, X)
    return ' '.join(X_clean)

result = []
for row in df.loc[:,"tweet"]:
    result.append(clean_text(row))

In [18]:
df["clean_tweet"] = result

In [19]:
df.head()

Unnamed: 0,class,tweet,clean_tweet
0,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i use cause they offer wheel...
2,0,bihday your majesty,bihday your majesty
3,0,#model i love u take with u all the time in ...,i love u take with u all the time in
4,0,factsguide: society now #motivation,society now


### Stop words removal

In [20]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
newStopWords = ["rt"]
stopwords_list.extend(newStopWords)

def remove_stop_words(X):
    X = X.split()
    X_new = [x for x in X if x not in stopwords_list]
    return ' '.join(X_new)

result = []
for row in df.loc[:,"clean_tweet"]:
    result.append(remove_stop_words(row))
    
df["final_tweet"] = result

In [21]:
df.drop(columns = ["tweet", "clean_tweet"], inplace = True)
df.head()

Unnamed: 0,class,final_tweet
0,0,father dysfunctional selfish drags kids
1,0,thanks credit use cause offer wheelchair vans
2,0,bihday majesty
3,0,love u take u time
4,0,society


Some values are non NA but empty strings (as the result of data preprocessing, so they should be deleted):

In [22]:
nan_value = float("NaN")
df.replace("", nan_value, inplace=True)
df.dropna(subset = ["final_tweet"], inplace=True)
df.reset_index(drop=True, inplace=True)

In [23]:
df["class"].value_counts()

1    44171
0    33054
Name: class, dtype: int64

In [24]:
df.to_csv(os.path.join(path_to_folder, "final_dataset.csv"), encoding='utf-8', index = False)

# Feature extraction and training of ML models

## Simple count method

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
cv = CountVectorizer()

In [26]:
X_cv = cv.fit_transform(df["final_tweet"])

In [27]:
X_cv.shape

(77225, 26894)

In [28]:
from sklearn import model_selection, naive_bayes, svm
train_x, test_x, train_y, test_y = model_selection.train_test_split(X_cv, df["class"], test_size = 0.25)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

LG = LogisticRegression(max_iter = 500)
param_grid_lg = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
grid_lg = GridSearchCV(LG, param_grid_lg, cv = 10)

# fitting the model for grid search 
grid_lg.fit(train_x, train_y) 
 
# print best parameter after tuning 
print(grid_lg.best_params_) 
grid_predictions = grid_lg.predict(test_x) 
   
# print classification report 
print(classification_report(test_y, grid_predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 10.0, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      8213
           1       0.93      0.90      0.91     11094

    accuracy                           0.90     19307
   macro avg       0.90      0.90      0.90     19307
weighted avg       0.90      0.90      0.90     19307



In [30]:
NB = naive_bayes.MultinomialNB()
NB.fit(train_x, train_y)
predictions = NB.predict(test_x)

In [31]:
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       0.88      0.83      0.85      8213
           1       0.88      0.91      0.89     11094

    accuracy                           0.88     19307
   macro avg       0.88      0.87      0.87     19307
weighted avg       0.88      0.88      0.88     19307



In [32]:
SVM = svm.SVC(kernel='linear')
SVM.fit(train_x, train_y)
predictions = SVM.predict(test_x)

In [33]:
from sklearn.metrics import classification_report
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89      8213
           1       0.93      0.89      0.91     11094

    accuracy                           0.90     19307
   macro avg       0.90      0.90      0.90     19307
weighted avg       0.90      0.90      0.90     19307



## TF-IDF method

In [34]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df["final_tweet"])

In [35]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(X_tfidf, df["class"], test_size = 0.25)

In [36]:
LG.fit(train_x, train_y)
predictions = LG.predict(test_x)

In [37]:
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       0.84      0.94      0.88      8274
           1       0.95      0.86      0.90     11033

    accuracy                           0.89     19307
   macro avg       0.89      0.90      0.89     19307
weighted avg       0.90      0.89      0.90     19307



In [38]:
NB.fit(train_x, train_y)
predictions = NB.predict(test_x)

In [39]:
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       0.90      0.73      0.80      8274
           1       0.82      0.94      0.88     11033

    accuracy                           0.85     19307
   macro avg       0.86      0.83      0.84     19307
weighted avg       0.85      0.85      0.84     19307



In [40]:
SVM.fit(train_x, train_y)
predictions = SVM.predict(test_x)

In [43]:
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89      8717
           1       0.95      0.87      0.91     11057

    accuracy                           0.90     19774
   macro avg       0.90      0.90      0.90     19774
weighted avg       0.90      0.90      0.90     19774



# Saving the best model

In [29]:
import pickle
Pkl_Filename = "svm_model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(SVM, file)

# Example of applying the model to unseen tweets

## Setting up Twitter connection

https://developer.twitter.com/en/docs/tutorials/step-by-step-guide-to-making-your-first-request-to-the-twitter-api-v2

In [31]:
search_term = input("Type the topic: ")

Type the topic: cat


In [32]:
#start_time = (datetime.now(timezone.utc).astimezone() - timedelta(seconds=50)).isoformat()
end_time = (datetime.now(timezone.utc).astimezone() - timedelta(seconds=10)).isoformat()

In [33]:
end_time

'2022-04-21T11:15:37.303057+02:00'

In [34]:
import requests
import os
import json

bearer_token = "AAAAAAAAAAAAAAAAAAAAAKovZQEAAAAAWKm9VUw8iFuu27OQ1m34HwIR5VY%3DbPl1vQZq69vtsvc5JPq5HD9AiatIKCw58RBeXOugqsl5Holcog"

search_url = "https://api.twitter.com/2/tweets/search/recent" #Change to the endpoint you want to collect data from

#change params based on the endpoint you are using
query_params = {'query': search_term,
                #'start_time': start_time,
                'end_time': end_time,
                'max_results': '100',
                'tweet.fields': 'text,geo,created_at,lang,public_metrics,source',
                'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                'next_token': {}}


def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """

    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2RecentSearchPython"
    return r

def connect_to_endpoint(url, params, next_token = None):
    response = requests.get(url, auth=bearer_oauth, params=params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()


json_response = connect_to_endpoint(search_url, query_params)

In [36]:
json_response

{'data': [{'source': 'Twitter for iPhone',
   'text': '.@sea_cat_boo https://t.co/wBg9SGDZHZ',
   'created_at': '2022-04-21T09:15:36.000Z',
   'public_metrics': {'retweet_count': 0,
    'reply_count': 0,
    'like_count': 0,
    'quote_count': 0},
   'id': '1517069532451545088',
   'lang': 'und'},
  {'source': 'Twitter Web App',
   'text': 'RT @NewVoiceUkraine: Do you remember the charred cat from Kyiv region, the photo of which flew around the world? He has a new life now 🐈❤…',
   'created_at': '2022-04-21T09:15:36.000Z',
   'public_metrics': {'retweet_count': 54,
    'reply_count': 0,
    'like_count': 0,
    'quote_count': 0},
   'id': '1517069531562319872',
   'lang': 'en'},
  {'source': 'Twitter for iPhone',
   'text': 'RT @Lemon0517ch: リモート会議してるとこうなるので困ってます...\n\n#猫\u3000#cat https://t.co/96kbi7zuM7',
   'created_at': '2022-04-21T09:15:36.000Z',
   'public_metrics': {'retweet_count': 5687,
    'reply_count': 0,
    'like_count': 0,
    'quote_count': 0},
   'id': '151706953128987

## Loading and cleaning the data

In [35]:
tweets = pd.DataFrame.from_dict(json_response["data"])

In [37]:
tweets

Unnamed: 0,source,text,created_at,public_metrics,id,lang
0,Twitter for iPhone,.@sea_cat_boo https://t.co/wBg9SGDZHZ,2022-04-21T09:15:36.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",1517069532451545088,und
1,Twitter Web App,RT @NewVoiceUkraine: Do you remember the charr...,2022-04-21T09:15:36.000Z,"{'retweet_count': 54, 'reply_count': 0, 'like_...",1517069531562319872,en
2,Twitter for iPhone,RT @Lemon0517ch: リモート会議してるとこうなるので困ってます...\n\n#...,2022-04-21T09:15:36.000Z,"{'retweet_count': 5687, 'reply_count': 0, 'lik...",1517069531289878528,ja
3,Twitter for Android,RT @iSabadellcat: [Els finalistes del Premi Sa...,2022-04-21T09:15:36.000Z,"{'retweet_count': 2, 'reply_count': 0, 'like_c...",1517069530605961218,ca
4,Twitter for Android,RT @villannefer: school of the cat https://t.c...,2022-04-21T09:15:36.000Z,"{'retweet_count': 20, 'reply_count': 0, 'like_...",1517069530341773313,en
...,...,...,...,...,...,...
95,Twitter for iPhone,RT @Alabiistired: The cat ownership in Lagos i...,2022-04-21T09:15:23.000Z,"{'retweet_count': 78, 'reply_count': 0, 'like_...",1517069477925376000,en
96,Twitter for iPhone,bts also supports the cat agenda and we are th...,2022-04-21T09:15:23.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",1517069477346643969,en
97,Twitter for Android,RT @vkoobuns: never ending saga of yoongi cat ...,2022-04-21T09:15:23.000Z,"{'retweet_count': 5297, 'reply_count': 0, 'lik...",1517069477158223872,en
98,Twitter for Android,RT @SBolsri: เป็นคลิปแรกของช่อง STUDIO CHOOM\n...,2022-04-21T09:15:23.000Z,"{'retweet_count': 207, 'reply_count': 0, 'like...",1517069476432596992,th


In [97]:
tweets = tweets[tweets.lang == 'en']

In [98]:
result = []
for row in tweets.loc[:,"text"]:
    result.append(clean_text(row))
tweets["clean_tweet"] = result

result = []
for row in tweets.loc[:,"clean_tweet"]:
    result.append(remove_stop_words(row))
    
tweets["final_tweet"] = result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets["clean_tweet"] = result
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets["final_tweet"] = result


In [99]:
tweets

Unnamed: 0,text,id,lang,created_at,public_metrics,source,withheld,clean_tweet,final_tweet
1,RT @OriginalRamayan: @KyivIndependent 'SpaceX ...,1516443905364955137,en,2022-04-19T15:49:35.000Z,"{'retweet_count': 2, 'reply_count': 0, 'like_c...",Twitter for iPhone,,rt starlink terminals sent to ukraine cost us ...,starlink terminals sent ukraine cost us taxpay...
2,RT @_InThisTogether: And the title of the link...,1516443905180553223,en,2022-04-19T15:49:35.000Z,"{'retweet_count': 14, 'reply_count': 0, 'like_...",Twitter for iPad,,rt and the title of the linked imf warns of so...,title linked imf warns social unrest effects a...
4,@Ahmedmaq00 @Arm_Of_Bucky Ukraine,1516443905042300930,en,2022-04-19T15:49:35.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Twitter for iPhone,,ukraine,ukraine
5,RT @bulatov00: the world and history will take...,1516443904773603337,en,2022-04-19T15:49:35.000Z,"{'retweet_count': 2, 'reply_count': 0, 'like_c...",Twitter for Android,,rt the world and history will take from russia...,world history take russia much russian missile...
6,RT @simon_schama: In a wide-ranging conversati...,1516443904056340485,en,2022-04-19T15:49:35.000Z,"{'retweet_count': 169, 'reply_count': 0, 'like...",Twitter Web App,,rt in a conversation at his compound in volody...,conversation compound volodymyr zelensky tells...
...,...,...,...,...,...,...,...,...,...
95,@JoeBiden Dear Joe Biden need support to Ukrai...,1516443883240140804,en,2022-04-19T15:49:30.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Twitter for Android,,dear joe biden need support to i am advisor of...,dear joe biden need support advisor know wrote...
96,@RT_com's account has been withheld in Portuga...,1516443882518679561,en,2022-04-19T15:49:30.000Z,"{'retweet_count': 150, 'reply_count': 0, 'like...",Twitter for iPhone,"{'copyright': False, 'country_codes': ['AT', '...",account has been withheld in czech belgium in ...,account withheld czech belgium response legal ...
97,RT @RepSwalwell: Trump has always rooted for R...,1516443882136883203,en,2022-04-19T15:49:30.000Z,"{'retweet_count': 2193, 'reply_count': 0, 'lik...",Twitter for iPhone,,rt trump has always rooted for he was an ameri...,trump always rooted american essentially worki...
98,RT @JackDetsch: JUST IN: US is set to deliver ...,1516443882095267843,en,2022-04-19T15:49:30.000Z,"{'retweet_count': 53, 'reply_count': 0, 'like_...",Twitter for Android,,rt just us is set to deliver seven more flight...,us set deliver seven flights military aid ukra...


## Feature extraction

In [100]:
#cv = CountVectorizer()
tweets_cv = cv.transform(tweets["final_tweet"])

In [101]:
tweets_cv.shape

(85, 26894)

## Classification

In [102]:
predictions = LG.predict(tweets_cv)

In [103]:
# class 0 is normal tweets and class 1 is toxic/hate/offensive tweets 
np.unique(predictions, return_counts=True)

(array([0, 1], dtype=int64), array([68, 17], dtype=int64))

In [104]:
for i in range(len(tweets)):
	print(tweets.iloc[i, 8], predictions[i])

starlink terminals sent ukraine cost us taxpayers report money really well 0
title linked imf warns social unrest effects actions 0
ukraine 0
world history take russia much russian missiles take every lost life 0
conversation compound volodymyr zelensky tells atlantic ukraine needs 0
ukraine hello name anna ukrainian artist currently works 0
due counteroffensive donetsk armed forces restored full control maryinka key frontline towns since several days ago russians tried seize general staff 0
even tell much russians loved show screenshot 0
us clear russia keeps calling us allies enemies threatening obvious patriotism translates support translates support russia 1
know making new heart think going happen 0
attack brutal attack war aggression violating human 0
nationalist writer nikolai starikov says ordinary russians want country tougher ukraine citizens 1
netherlands send heavy including armoured vehicles ukraine 0
actual alpha men fighting genocide tuckums friends 0
william chang chris