In [None]:
from __future__ import print_function, division

from GatherData.config import PASS, LOGIN

import certifi

import numpy as np
import pandas as pd

import statsmodels.api as sm
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import warnings

from pymongo import MongoClient

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
sns.set()

# Step 1
## Get the data from MongoDB

In [8]:
# Loading data from mongoDB
ca = certifi.where()

client = MongoClient(f"mongodb+srv://{LOGIN}:{PASS}@cluster0.psdqkii.mongodb.net/Twitter", tlsCAFile=ca)
db = client["Ukraine_war"]
collection = db["Putin"]

In [60]:
query = {}
cursor = collection.find(query)
df = pd.DataFrame(list(cursor))
# Drop duplicates by id to only get different text data
df.drop_duplicates(subset=["id"],inplace=True)


In [61]:
df

Unnamed: 0,_id,created_at,id,text,retweet_count,favorite_count,hashtag,screen_name,name,favourites_count,friends_count,followers_count,statuses_count,verified,user_created_at
0,637fe25c077f062d5f873b5e,2022-11-24 19:29:59+00:00,1595862310953820163,Unbelievable! Was this interview supposed to p...,1,8.0,#TheOneShow,,,,,,,,NaT
1,637fe25c077f062d5f873b61,2022-11-24 19:25:37+00:00,1595861213409599488,The girls really let themselves down☹#TheOneShow,0,2.0,#TheOneShow,,,,,,,,NaT
2,637fe25c077f062d5f873b5c,2022-11-24 20:49:24+00:00,1595882299593211904,I found the chaos of tonight's #TheOneShow hil...,0,0.0,#TheOneShow,,,,,,,,NaT
3,637fe25c077f062d5f873b63,2022-11-24 19:21:56+00:00,1595860284811378696,Is #RonanKeating putting on that #accent?? #th...,0,0.0,#TheOneShow,,,,,,,,NaT
4,637fe25c077f062d5f873b5b,2022-11-24 21:03:18+00:00,1595885797768859648,#theoneshow Has all the Botox gone to Mel B's ...,0,1.0,#TheOneShow,,,,,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100896,6387ca582999ab3d122ba242,2022-11-30 19:38:35+00:00,1598038801859088384,That penalty is an absolute joke!! Terrible!! ...,0,0.0,#POLARG,,,,,,,,NaT
100897,6387ca582999ab3d122ba244,2022-11-30 19:38:35+00:00,1598038801443860480,That is possibly one of the worst penalty deci...,0,1.0,#POLARG,,,,,,,,NaT
100898,6387ca582999ab3d122ba25d,2022-11-30 19:38:34+00:00,1598038797916450816,That's a terrible decision!!! #POLARG,0,4.0,#POLARG,,,,,,,,NaT
100899,6387ca582999ab3d122ba271,2022-11-30 19:38:33+00:00,1598038795160412160,"How in the world of holy fuck, is that a penal...",0,4.0,#POLARG,,,,,,,,NaT


# Step 1
## Preproces text data

In [49]:
import nltk
import emoji

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Return tokenized text as Twitter have different Hashes and mentions
from nltk.tokenize import TweetTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\modze\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
df_text = df["text"]
# emoji.demojize(df_text[2]) idea for later


In [66]:
# Text preprocessing function
def preprocess_text(text, stem=False):
    tokens = []
    print(text)

In [67]:
df_text = df_text.apply(lambda x: preprocess_text(x))

0         Unbelievable! Was this interview supposed to p...
1          The girls really let themselves down☹#TheOneShow
2         I found the chaos of tonight's #TheOneShow hil...
3         Is #RonanKeating putting on that #accent?? #th...
4         #theoneshow Has all the Botox gone to Mel B's ...
                                ...                        
100896    That penalty is an absolute joke!! Terrible!! ...
100897    That is possibly one of the worst penalty deci...
100898                That's a terrible decision!!! #POLARG
100899    How in the world of holy fuck, is that a penal...
100900      #t5m2 Moje sny be like: https://t.co/LsWYQlY4kB
Name: text, Length: 95557, dtype: object

Unnamed: 0,_id,created_at,id,text,retweet_count,favorite_count,hashtag,screen_name,name,favourites_count,friends_count,followers_count,statuses_count,verified,user_created_at
0,637fe25c077f062d5f873b5e,2022-11-24 19:29:59+00:00,1595862310953820163,Unbelievable! Was this interview supposed to p...,1,8.0,#TheOneShow,,,,,,,,NaT
1,637fe25c077f062d5f873b61,2022-11-24 19:25:37+00:00,1595861213409599488,The girls really let themselves down☹#TheOneShow,0,2.0,#TheOneShow,,,,,,,,NaT
2,637fe25c077f062d5f873b5c,2022-11-24 20:49:24+00:00,1595882299593211904,I found the chaos of tonight's #TheOneShow hil...,0,0.0,#TheOneShow,,,,,,,,NaT
3,637fe25c077f062d5f873b63,2022-11-24 19:21:56+00:00,1595860284811378696,Is #RonanKeating putting on that #accent?? #th...,0,0.0,#TheOneShow,,,,,,,,NaT
4,637fe25c077f062d5f873b5b,2022-11-24 21:03:18+00:00,1595885797768859648,#theoneshow Has all the Botox gone to Mel B's ...,0,1.0,#TheOneShow,,,,,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100896,6387ca582999ab3d122ba242,2022-11-30 19:38:35+00:00,1598038801859088384,That penalty is an absolute joke!! Terrible!! ...,0,0.0,#POLARG,,,,,,,,NaT
100897,6387ca582999ab3d122ba244,2022-11-30 19:38:35+00:00,1598038801443860480,That is possibly one of the worst penalty deci...,0,1.0,#POLARG,,,,,,,,NaT
100898,6387ca582999ab3d122ba25d,2022-11-30 19:38:34+00:00,1598038797916450816,That's a terrible decision!!! #POLARG,0,4.0,#POLARG,,,,,,,,NaT
100899,6387ca582999ab3d122ba271,2022-11-30 19:38:33+00:00,1598038795160412160,"How in the world of holy fuck, is that a penal...",0,4.0,#POLARG,,,,,,,,NaT
