In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import pickle
import os
import json
import datetime
from config import config
import contractions
import pickle
import mysql.connector
from mysql.connector import Error
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif, RFE
import statsmodels.api as sm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score,f1_score
from sklearn.metrics import auc, average_precision_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from string import punctuation
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("white")
plt.rcParams['figure.figsize'] = (18, 5)
import warnings
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', UserWarning)



In [2]:
class ToPandasDF():
    def __init__(self, password, host, database, user):

        self.password = password
        self.host = host
        self.database = database
        self.user = user
        
    
    def MySQLconnect(self, query):
        
        try:
            connection = mysql.connector.connect(host=self.host, 
                                                 database=self.database, 
                                                 password=self.password,
                                                 user=self.user)

            if connection.is_connected():

                print("Successfully connected to the database\n")

                cursor = connection.cursor()
                query = query
                cursor.execute(query)

                data = cursor.fetchall()

                df = pd.DataFrame(data, columns = ['id', 'date', 'tweet'])
        except Error as e:
            print(e)
            
        cursor.close()
        connection.close()
        
        return df
    
    def check_if_valid_data(self, data):
        
        # Create a timestamp of the date(Day, Month & Year)
        data['timestamp'] = data['date'].apply(lambda x: pd.Timestamp(x).strftime('%Y-%m-%d'))
        
        if data.empty:
            print("No tweets downloaded. Finishing execution")
            
        if data['id'].unique().all():
            pass
        else:
            print(f"Primary Key check is violated, Number of duplicate values: {data.duplicated().sum()}")
            
        if data.isnull().values.any():
            print(f"\nNull values detected, Number of null: \n{data.isnull().sum()}")
        
        yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
        yesterday = yesterday.replace(hour=0, minute=0, second=0, microsecond=0)
        timestamps = data['timestamp'].tolist()
        for timestamp in timestamps:
            if datetime.datetime.strptime(timestamp, '%Y-%m-%d') != yesterday:
                print("Atleast one of the returned tweet does not come from within the last 24 hours")


if __name__ == '__main__':
    
    t = ToPandasDF(config.PASSWORD, config.HOST, config.DATABASE, config.USER)
    data = t.MySQLconnect("SELECT id, created_at, tweet FROM `twitterdb`.`twitter_table`;")
    t.check_if_valid_data(data)

Successfully connected to the database

Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the returned tweet does not come from within the last 24 hours
Atleast one of the retur

In [3]:
print(data.shape)
data.head(10)

(3799, 4)


Unnamed: 0,id,date,tweet,timestamp
0,1541844184973320197,2022-06-28 18:01:14,There are some exciting changes happening over...,2022-06-28
1,1541896143445139456,2022-06-28 21:27:42,Old wh ite #limpDick men controlling young ute...,2022-06-28
2,1541916307826028545,2022-06-28 22:47:49,My intuition really be scaring me sometimes. I...,2022-06-28
3,1542127035480285185,2022-06-29 12:45:11,hi there!!! cats are feeling excited and calm....,2022-06-29
4,1542135888766967808,2022-06-29 13:20:22,Blog Stagflation requires more than just infla...,2022-06-29
5,1542230260028248064,2022-06-29 19:35:21,I just wanna feel🥺 #depressed #longdrives #int...,2022-06-29
6,1542422260526551043,2022-06-30 08:18:18,@Melbcity @denazifyru @PonteZico @SMccoull @Ve...,2022-06-30
7,1542511836255334401,2022-06-30 14:14:14,Nine people six of whom are #supremacists none...,2022-06-30
8,1542511964848435201,2022-06-30 14:14:45,RT @maysoonzayid: Nine people six of whom are ...,2022-06-30
9,1542512421398425600,2022-06-30 14:16:34,RT @maysoonzayid: Nine people six of whom are ...,2022-06-30


In [4]:
# These are duplicated rows
data.duplicated().sum()
# # Data with no duplicate
# data=data[~data.duplicated()]

# The data with usernames in the tweets column
data['tweet'].head(10)

0    There are some exciting changes happening over...
1    Old wh ite #limpDick men controlling young ute...
2    My intuition really be scaring me sometimes. I...
3    hi there!!! cats are feeling excited and calm....
4    Blog Stagflation requires more than just infla...
5    I just wanna feel🥺 #depressed #longdrives #int...
6    @Melbcity @denazifyru @PonteZico @SMccoull @Ve...
7    Nine people six of whom are #supremacists none...
8    RT @maysoonzayid: Nine people six of whom are ...
9    RT @maysoonzayid: Nine people six of whom are ...
Name: tweet, dtype: object

In [5]:
# We made the usernames anonymous in the tweet column
data['tweet'] = data.tweet.replace(regex=re.compile(r"@([A-Za-z0-9_]+)"), value='@user')
data.tweet.head(10)

0    There are some exciting changes happening over...
1    Old wh ite #limpDick men controlling young ute...
2    My intuition really be scaring me sometimes. I...
3    hi there!!! cats are feeling excited and calm....
4    Blog Stagflation requires more than just infla...
5    I just wanna feel🥺 #depressed #longdrives #int...
6    @user @user @user @user @user so guys, what's ...
7    Nine people six of whom are #supremacists none...
8    RT @user: Nine people six of whom are #suprema...
9    RT @user: Nine people six of whom are #suprema...
Name: tweet, dtype: object