# Data exploration and pre-processing

## Imports 

In [1]:
import re
import pandas as pd

## Data reading 

In [22]:
# Load the processed tweet CSV (path is relative to the notebook's working directory)
# and show the resulting frame as the cell output.
csv_path = "../processed_tweet_data.csv"
tweets = pd.read_csv(csv_path)
tweets

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Fri Jun 18 17:55:49 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...","RT @TelGlobalHealth: 🚨Africa is ""in the midst ...",0.000000,0.000000,en,,,ketuesriche,551,351,,[],"[{'screen_name': 'TelGlobalHealth', 'name': 'T...",Mass
1,Fri Jun 18 17:55:59 +0000 2021,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @globalhlthtwit: Dr Moeti is head of WHO in...,0.133333,0.455556,en,,,Grid1949,66,92,,[],"[{'screen_name': 'globalhlthtwit', 'name': 'An...","Edinburgh, Scotland"
2,Fri Jun 18 17:56:07 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...",RT @NHSRDForum: Thank you @research2note for c...,0.316667,0.483333,en,,,LeeTomlinson8,1195,1176,,"[{'text': 'red4research', 'indices': [103, 116]}]","[{'screen_name': 'NHSRDForum', 'name': 'NHS R&...",
3,Fri Jun 18 17:56:10 +0000 2021,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @HighWireTalk: Former Pfizer VP and Virolog...,0.166667,0.166667,en,,,RIPNY08,2666,2704,,[],"[{'screen_name': 'HighWireTalk', 'name': 'The ...",
4,Fri Jun 18 17:56:20 +0000 2021,"<a href=""http://twitter.com/download/android"" ...",RT @PeterHotez: I think it’s important that we...,0.300000,0.766667,en,,,pash22,28250,30819,,[],"[{'screen_name': 'PeterHotez', 'name': 'Prof P...",United Kingdom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6527,Sat Jun 19 07:41:15 +0000 2021,"<a href=""http://twitter.com/download/android"" ...",RT @Givenkazeni: Zweli please just release the...,0.000000,0.400000,en,,,Mthatos_Vivi,447,1089,,[],"[{'screen_name': 'Givenkazeni', 'name': 'le’Gi...",
6528,Sat Jun 19 07:41:26 +0000 2021,"<a href=""http://twitter.com/download/android"" ...",RT @HighWireTalk: Former Pfizer VP and Virolog...,0.166667,0.166667,en,,,wayno_af007,2224,2739,,[],"[{'screen_name': 'HighWireTalk', 'name': 'The ...","The boro, MA"
6529,Sat Jun 19 07:41:31 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...","@Jenfeds73 @DcrInYYC Respectfully, veterinaria...",0.281250,0.506250,en,,,dublonothing,3000,4709,,[],"[{'screen_name': 'Jenfeds73', 'name': 'Bubs 🇨🇦...","Los Angeles, CA"
6530,Sat Jun 19 07:41:45 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...","RT @WHOAFRO: ""Africa needs millions more doses...",0.166667,0.166667,en,,,DrAmirKhanGP,135163,1284,,"[{'text': 'COVID19', 'indices': [120, 128]}]","[{'screen_name': 'WHOAFRO', 'name': 'WHO Afric...",Yorkshire and The Humber


## Pre-processing

Get summary information about the data (column names, non-null counts, and dtypes).

In [23]:
# Print each column's non-null count and dtype to spot empty or sparse columns.
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6532 entries, 0 to 6531
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   created_at          6532 non-null   object 
 1   source              6532 non-null   object 
 2   original_text       6532 non-null   object 
 3   polarity            6532 non-null   float64
 4   subjectivity        6532 non-null   float64
 5   lang                6532 non-null   object 
 6   favorite_count      0 non-null      float64
 7   retweet_count       0 non-null      float64
 8   original_author     6532 non-null   object 
 9   followers_count     6532 non-null   int64  
 10  friends_count       6532 non-null   int64  
 11  possibly_sensitive  0 non-null      float64
 12  hashtags            6532 non-null   object 
 13  user_mentions       6532 non-null   object 
 14  place               4088 non-null   object 
dtypes: float64(5), int64(2), object(8)
memory usage: 765.6+

Missing values: the total count and the columns that contain them.

In [24]:
# Count nulls per column once and reuse the result for both summaries.
missing_per_column = tweets.isnull().sum()

# Total number of missing cells across the whole frame.
print("The number of missing value(s): {}".format(missing_per_column.sum()))

# Label corrected: this line lists the columns that contain at least one missing value.
print("Columns having missing value(s): {}".format(tweets.columns[missing_per_column > 0]))

The number of missing value(s): 22040
Columons having columns value: Index(['favorite_count', 'retweet_count', 'possibly_sensitive', 'place'], dtype='object')


### Clean original_text


In [None]:
def clean_tweet(tweet):
    """Return `tweet` with every character outside a-z / A-Z replaced by a space.

    Each non-letter character (digits, punctuation, whitespace, emoji, ...)
    becomes exactly one space, so the result has the same length as the input
    and runs of non-letters become runs of spaces.
    """
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", tweet)


# Overwrite the tweet text column with its letters-only version, element by element.
tweets["original_text"] = tweets["original_text"].apply(clean_tweet)

### Convert tweet sentiment to category

In [29]:
def text_category(p):
  """Map a numeric polarity score to a sentiment label.

  Returns "positive" when `p` is greater than zero, "negative" when it is
  less than zero, and "neutral" for anything else (including exactly zero).
  """
  if p > 0:
    return "positive"
  if p < 0:
    return "negative"
  return "neutral"

In [21]:
# Convert the numeric polarity scores into sentiment labels, in the same column.
# The dtype guard makes this cell safe to re-run: once the column already holds
# string labels, calling text_category on them would raise TypeError
# (a str cannot be compared with 0), so the conversion is skipped.
if pd.api.types.is_numeric_dtype(tweets["polarity"]):
    tweets["polarity"] = tweets["polarity"].apply(text_category)

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place


## Data exploration 

### Tweet languages

In [25]:
# Number of tweets per language code.
tweet_lang = (
    tweets
    .groupby(by=['lang'])
    .size()
)
tweet_lang

lang
en    6532
dtype: int64

### Tweet sentiments

In [27]:
# Number of tweets per value of the polarity column.
tweet_sent = (
    tweets
    .groupby(by=['polarity'])
    .size()
)
tweet_sent

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Fri Jun 18 17:55:49 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...","RT @TelGlobalHealth: 🚨Africa is ""in the midst ...",neutral,0.000000,en,,,ketuesriche,551,351,,[],"[{'screen_name': 'TelGlobalHealth', 'name': 'T...",Mass
1,Fri Jun 18 17:55:59 +0000 2021,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @globalhlthtwit: Dr Moeti is head of WHO in...,positive,0.455556,en,,,Grid1949,66,92,,[],"[{'screen_name': 'globalhlthtwit', 'name': 'An...","Edinburgh, Scotland"
2,Fri Jun 18 17:56:07 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...",RT @NHSRDForum: Thank you @research2note for c...,positive,0.483333,en,,,LeeTomlinson8,1195,1176,,"[{'text': 'red4research', 'indices': [103, 116]}]","[{'screen_name': 'NHSRDForum', 'name': 'NHS R&...",
3,Fri Jun 18 17:56:10 +0000 2021,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @HighWireTalk: Former Pfizer VP and Virolog...,positive,0.166667,en,,,RIPNY08,2666,2704,,[],"[{'screen_name': 'HighWireTalk', 'name': 'The ...",
4,Fri Jun 18 17:56:20 +0000 2021,"<a href=""http://twitter.com/download/android"" ...",RT @PeterHotez: I think it’s important that we...,positive,0.766667,en,,,pash22,28250,30819,,[],"[{'screen_name': 'PeterHotez', 'name': 'Prof P...",United Kingdom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6527,Sat Jun 19 07:41:15 +0000 2021,"<a href=""http://twitter.com/download/android"" ...",RT @Givenkazeni: Zweli please just release the...,neutral,0.400000,en,,,Mthatos_Vivi,447,1089,,[],"[{'screen_name': 'Givenkazeni', 'name': 'le’Gi...",
6528,Sat Jun 19 07:41:26 +0000 2021,"<a href=""http://twitter.com/download/android"" ...",RT @HighWireTalk: Former Pfizer VP and Virolog...,positive,0.166667,en,,,wayno_af007,2224,2739,,[],"[{'screen_name': 'HighWireTalk', 'name': 'The ...","The boro, MA"
6529,Sat Jun 19 07:41:31 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...","@Jenfeds73 @DcrInYYC Respectfully, veterinaria...",positive,0.506250,en,,,dublonothing,3000,4709,,[],"[{'screen_name': 'Jenfeds73', 'name': 'Bubs 🇨🇦...","Los Angeles, CA"
6530,Sat Jun 19 07:41:45 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...","RT @WHOAFRO: ""Africa needs millions more doses...",positive,0.166667,en,,,DrAmirKhanGP,135163,1284,,"[{'text': 'COVID19', 'indices': [120, 128]}]","[{'screen_name': 'WHOAFRO', 'name': 'WHO Afric...",Yorkshire and The Humber


polarity
negative    1216
neutral     2508
positive    2808
dtype: int64

## Visualisations

## Save the Data