## Twitter Data Cleaning in Python

In [1]:
!pip install snscrape



In [15]:
#Import libraries
import pandas as pd
from time import sleep
from tqdm import tqdm
import re
import nltk
import snscrape.modules.twitter as sntwitter

In [3]:
#Add query
query = "(#PeterObi OR #PeterObi2023) until:2022-06-26 since:2022-05-01"
tweets =[]
limit = 30000


for tweet in sntwitter.TwitterHashtagScraper(query).get_items():
    if len(tweets)== limit:
        break
    else:
        tweets.append([tweet.date, tweet.url, tweet.user.username, tweet.sourceLabel, tweet.user.location, tweet.content, tweet.likeCount, tweet.retweetCount, tweet.quoteCount, tweet.replyCount])
#Load to DataFrame  
        df1 = pd.DataFrame(tweets, columns = ['Date', 'TweetUrl', 'User', 'Source', 'Location', 'Tweet', 'Likes_Count', 'Retweet_Count', 'Quote_Count', 'Reply_Count'])
#Load to csv file                                               
df1.to_csv('snscrape_election.csv')

In [4]:
df1.head()

Unnamed: 0,Date,TweetUrl,User,Source,Location,Tweet,Likes_Count,Retweet_Count,Quote_Count,Reply_Count
0,2022-06-25 23:58:50+00:00,https://twitter.com/Plato48466760/status/15408...,Plato48466760,Twitter for Android,,We understand your relationship with @segalink...,1,0,0,0
1,2022-06-25 23:57:29+00:00,https://twitter.com/AsHiiR_/status/15408466728...,AsHiiR_,Twitter for iPhone,All over Nigeria,If you're an #OBIdient in Niger state hit me u...,0,0,0,0
2,2022-06-25 23:55:55+00:00,https://twitter.com/ObiSupport/status/15408462...,ObiSupport,Twitter for iPhone,Nigeria,"2023: Voting #PeterObi, The New Narrative In N...",32,12,2,0
3,2022-06-25 23:51:26+00:00,https://twitter.com/cleoterria/status/15408451...,cleoterria,Twitter for Android,,tonight was another failed gas lighting attemp...,0,0,0,0
4,2022-06-25 23:50:59+00:00,https://twitter.com/humanfactorz/status/154084...,humanfactorz,Twitter for Android,🌎 🇺🇸,BUNDLE OF INTEGRITY #PeterObiForPresident2023 ...,0,0,0,0


In [5]:
#Location Count
df1["Location"].value_counts()[:60]

                                  8293
Nigeria                           3767
Lagos, Nigeria                    2949
Abuja, Nigeria                    1042
Federal Capital Territory, Nig     386
Lagos                              356
Port Harcourt, Nigeria             272
Enugu, Nigeria                     245
Abuja                              245
Earth                              193
Lagos, Nigeria.                    174
Lagos Nigeria                      166
Nigeria                            165
United States                      131
EARTH                              127
Owerri, Nigeria                    115
Lagos,Nigeria                      111
Africa                             108
Jos, Nigeria                       108
Ibadan, Nigeria                    106
Around the world                   103
London, England                    102
Uyo, Nigeria                        97
Lekki, Nigeria                      95
Kano, Nigeria                       93
Benin-City, Nigeria      

In [16]:
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [17]:
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

In [18]:
#applying the function
no_stopwords= lower.apply(lambda x:remove_stopwords(x))

In [30]:
#Load DataFrame
df = pd.read_csv("snscrape_election.csv",index = False)

In [31]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,TweetUrl,User,Source,Location,Tweet,Likes_Count,Retweet_Count,Quote_Count,Reply_Count
0,0,2022-06-25 23:58:50+00:00,https://twitter.com/Plato48466760/status/15408...,Plato48466760,Twitter for Android,,We understand your relationship with @segalink...,1,0,0,0
1,1,2022-06-25 23:57:29+00:00,https://twitter.com/AsHiiR_/status/15408466728...,AsHiiR_,Twitter for iPhone,All over Nigeria,If you're an #OBIdient in Niger state hit me u...,0,0,0,0
2,2,2022-06-25 23:55:55+00:00,https://twitter.com/ObiSupport/status/15408462...,ObiSupport,Twitter for iPhone,Nigeria,"2023: Voting #PeterObi, The New Narrative In N...",32,12,2,0
3,3,2022-06-25 23:51:26+00:00,https://twitter.com/cleoterria/status/15408451...,cleoterria,Twitter for Android,,tonight was another failed gas lighting attemp...,0,0,0,0
4,4,2022-06-25 23:50:59+00:00,https://twitter.com/humanfactorz/status/154084...,humanfactorz,Twitter for Android,🌎 🇺🇸,BUNDLE OF INTEGRITY #PeterObiForPresident2023 ...,0,0,0,0


In [32]:
#Drop duplicate
df= df.drop_duplicates("Tweet", keep ="first")
df

Unnamed: 0.1,Unnamed: 0,Date,TweetUrl,User,Source,Location,Tweet,Likes_Count,Retweet_Count,Quote_Count,Reply_Count
0,0,2022-06-25 23:58:50+00:00,https://twitter.com/Plato48466760/status/15408...,Plato48466760,Twitter for Android,,We understand your relationship with @segalink...,1,0,0,0
1,1,2022-06-25 23:57:29+00:00,https://twitter.com/AsHiiR_/status/15408466728...,AsHiiR_,Twitter for iPhone,All over Nigeria,If you're an #OBIdient in Niger state hit me u...,0,0,0,0
2,2,2022-06-25 23:55:55+00:00,https://twitter.com/ObiSupport/status/15408462...,ObiSupport,Twitter for iPhone,Nigeria,"2023: Voting #PeterObi, The New Narrative In N...",32,12,2,0
3,3,2022-06-25 23:51:26+00:00,https://twitter.com/cleoterria/status/15408451...,cleoterria,Twitter for Android,,tonight was another failed gas lighting attemp...,0,0,0,0
4,4,2022-06-25 23:50:59+00:00,https://twitter.com/humanfactorz/status/154084...,humanfactorz,Twitter for Android,🌎 🇺🇸,BUNDLE OF INTEGRITY #PeterObiForPresident2023 ...,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
29994,29994,2022-06-14 14:18:56+00:00,https://twitter.com/acmelegend/status/15367148...,acmelegend,Twitter for Android,NIGERIA,@PeterObi I stand with #PeterObi,0,0,0,0
29995,29995,2022-06-14 14:18:28+00:00,https://twitter.com/onyiforlife/status/1536714...,onyiforlife,Twitter for Android,"Benin-City, Nigeria",@PeterObi My president departed for Egypt 🇪🇬 S...,4,2,0,0
29997,29997,2022-06-14 14:18:02+00:00,https://twitter.com/RunMila1/status/1536714585...,RunMila1,Twitter for iPhone,,@_SirWilliam_ @PeterObi Thank you William!!! #...,0,0,0,0
29998,29998,2022-06-14 14:17:37+00:00,https://twitter.com/Kaynuel_/status/1536714480...,Kaynuel_,Twitter for Android,,@PeterObi Obidient to the core! #PeterObi #Pet...,0,0,0,0


In [33]:
#Load tweet column into a new DataFrame
df1 = df["Tweet"]

In [39]:
#Load to a CSV file
df1.to_csv("tweet.csv", index = False)
df1 = pd.read_csv("tweet.csv")
df1.head()

Unnamed: 0,Tweet
0,We understand your relationship with @segalink...
1,If you're an #OBIdient in Niger state hit me u...
2,"2023: Voting #PeterObi, The New Narrative In N..."
3,tonight was another failed gas lighting attemp...
4,BUNDLE OF INTEGRITY #PeterObiForPresident2023 ...


In [47]:
df1.head()

Unnamed: 0,Tweet
0,We understand your relationship with @segalink...
1,If you're an #OBIdient in Niger state hit me u...
2,"2023: Voting #PeterObi, The New Narrative In N..."
3,tonight was another failed gas lighting attemp...
4,BUNDLE OF INTEGRITY #PeterObiForPresident2023 ...


#### Remove punctuation mark

In [49]:
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
puntuation_free= df1["Tweet"].apply(lambda x:remove_punctuation(x))
puntuation_free.head()

0    We understand your relationship with segalink ...
1    If youre an OBIdient in Niger state hit me up ...
2    2023 Voting PeterObi The New Narrative In Nort...
3    tonight was another failed gas lighting attemp...
4    BUNDLE OF INTEGRITY PeterObiForPresident2023 P...
Name: Tweet, dtype: object

#### Convert to Lowercase

In [50]:
#Lowercasing all the letters
lower = puntuation_free.apply(lambda x: x.lower())
lower.head()

0    we understand your relationship with segalink ...
1    if youre an obidient in niger state hit me up ...
2    2023 voting peterobi the new narrative in nort...
3    tonight was another failed gas lighting attemp...
4    bundle of integrity peterobiforpresident2023 p...
Name: Tweet, dtype: object

In [54]:
df1["cleaned_tweet"] = lower

In [55]:
df1

Unnamed: 0,Tweet,cleaned_tweet
0,We understand your relationship with @segalink...,we understand your relationship with segalink ...
1,If you're an #OBIdient in Niger state hit me u...,if youre an obidient in niger state hit me up ...
2,"2023: Voting #PeterObi, The New Narrative In N...",2023 voting peterobi the new narrative in nort...
3,tonight was another failed gas lighting attemp...,tonight was another failed gas lighting attemp...
4,BUNDLE OF INTEGRITY #PeterObiForPresident2023 ...,bundle of integrity peterobiforpresident2023 p...
...,...,...
29799,@PeterObi I stand with #PeterObi,peterobi i stand with peterobi
29800,@PeterObi My president departed for Egypt 🇪🇬 S...,peterobi my president departed for egypt 🇪🇬 sa...
29801,@_SirWilliam_ @PeterObi Thank you William!!! #...,sirwilliam peterobi thank you william peterobi...
29802,@PeterObi Obidient to the core! #PeterObi #Pet...,peterobi obidient to the core peterobi peterob...


#### Replace Emoji with text
Load pickle file and replace emoji with text

In [90]:
import pickle
with open("emoji_dict.pkl","rb") as f:
    Emoji_dict = pickle.load(f)
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text):
    for emot in Emoji_Dict:
        text = re.sub(r'('+emot+')', "_".join(Emoji_Dict[emot].replace(",","").replace(":","").split()), text)
    return text

In [None]:
df1["cleaned_tweet"]= data[column].progress_apply(lambda text: convert_emoticons(str(text)))

In [None]:
df1.head()

In [None]:
for tweet in df1["cleaned_tweet"]:
    print(tweet)