# Exploratory Data Analysis of Climate Change Tweets

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

### Read data set

In [2]:
## The file also contains emojis, so the encoding is stated explicitly
tweets = pd.read_csv("./data/climate_change_tweets.csv", encoding="utf-8")

In [3]:
## Preview the first five rows of the data
tweets.head(n=5)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
0,Lauren Boebert,@laurenboebert,2022-01-17T23:32:38.000Z,Lauren Boebert\n@laurenboebert\n·\nJan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11.7K,[],https://twitter.com/laurenboebert/status/14832...
1,Catherine,@catherine___c,2022-01-17T22:54:02.000Z,Catherine\n@catherine___c\n·\nJan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...
2,king Keith,@KaConfessor,2022-01-17T23:51:41.000Z,king Keith\n@KaConfessor\n·\nJan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17T21:42:04.000Z,PETRIFIED CLIMATE PARENT\n@climate_parent\n·\n...,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...
4,Thomas Speight,@Thomas_Sp8,2022-01-17T21:10:40.000Z,Thomas Speight\n@Thomas_Sp8\n·\nJan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...


In [4]:
## Preview the last five rows of the data
tweets.tail(n=5)

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
9045,Dr Srijana Mitra Das,@srijanapiya17,2022-07-18T12:08:28.000Z,Dr Srijana Mitra Das\n@srijanapiya17\n·\nJul 18,#ClimateChange is now the greatest story on Ea...,,2.0,16.0,24.0,['https://pbs.twimg.com/profile_images/5140754...,https://twitter.com/srijanapiya17/status/15490...
9046,1%_Better_Every_Day,@jh336405,2022-07-18T00:33:20.000Z,1%_Better_Every_Day\n@jh336405\n·\nJul 18,Replying to \n@jh336405\n @acuna_r\n and 41 ot...,💯 💯 🌏,4.0,,,['https://pbs.twimg.com/profile_images/1442412...,https://twitter.com/jh336405/status/1548828230...
9047,David Schechter,@DavidSchechter,2022-07-18T21:13:13.000Z,David Schechter\n@DavidSchechter\n·\nJul 18,While Texans are being asked to use less elect...,,3.0,14.0,23.0,['https://pbs.twimg.com/card_img/1549138950475...,https://twitter.com/DavidSchechter/status/1549...
9048,Daily Climate,@TheDailyClimate,2022-07-18T10:15:09.000Z,Daily Climate\n@TheDailyClimate\n·\nJul 18,"Sea levels are rising, and communities are scr...",,,3.0,,['https://pbs.twimg.com/card_img/1547862999808...,https://twitter.com/TheDailyClimate/status/154...
9049,Robot Enthusiast,@robothusiast,2022-07-18T16:32:53.000Z,Robot Enthusiast\n@robothusiast\n·\nJul 18,UTS creates algae-analysing robot to combat cl...,,,,,['https://pbs.twimg.com/card_img/1549069752219...,https://twitter.com/robothusiast/status/154906...


In [5]:
## Shape of the data as a (number of rows, number of columns) tuple
tweets.shape

(9050, 11)

In [6]:
## Column labels of the data, exactly as they were read from the CSV header
tweets.columns

Index(['UserScreenName', 'UserName', 'Timestamp', 'Text', 'Embedded_text',
       'Emojis', 'Comments', 'Likes', 'Retweets', 'Image link', 'Tweet URL'],
      dtype='object')

In [7]:
## Data type of every column, shown as a one-column table

tweets.dtypes.to_frame(name="DataType")

Unnamed: 0,DataType
UserScreenName,object
UserName,object
Timestamp,object
Text,object
Embedded_text,object
Emojis,object
Comments,object
Likes,object
Retweets,object
Image link,object


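So there are 11 columns, and all of them are of type `object`.  Even the count columns (`Comments`, `Likes`, `Retweets`) were not read as numbers, which suggests that columns might have some missing information, e.g. some `NaN`s, or values that are not plain numbers.  The next step is to check for missing values. 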

In [8]:

## Fraction of missing values in each column
tweets.isna().mean()

UserScreenName    0.001436
UserName          0.000000
Timestamp         0.000000
Text              0.000000
Embedded_text     0.000000
Emojis            0.776133
Comments          0.306298
Likes             0.068398
Retweets          0.019116
Image link        0.000000
Tweet URL         0.000000
dtype: float64

In [9]:
## From the above we can see that ~78% of the rows do not have emojis. Not every tweet uses emojis, so this is not by itself a data quality concern.
## Around 30% do not have any comments. This could be the case where the Twitter account does not have followers, so it is not necessarily a data quality issue.
## Screen names are missing for < 1% of the data, so we can ignore that.

### The following data clean up is required

## Around 2% of retweets and 7% of likes do not have any data. There are two ways to fix that:
     # We can set the missing value to zero
     # We can set it to the average number of retweets/likes that the user gets.
## Convert Timestamp to datetime.
## Convert NaNs to empty strings for the columns we are interested in.
## We should also convert column names to lowercase and rename a few columns for readability.

In [10]:
## Lower-case every column label so that columns are easier to type
tweets.columns = tweets.columns.map(str.lower)
tweets.head(n=5)

Unnamed: 0,userscreenname,username,timestamp,text,embedded_text,emojis,comments,likes,retweets,image link,tweet url
0,Lauren Boebert,@laurenboebert,2022-01-17T23:32:38.000Z,Lauren Boebert\n@laurenboebert\n·\nJan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11.7K,[],https://twitter.com/laurenboebert/status/14832...
1,Catherine,@catherine___c,2022-01-17T22:54:02.000Z,Catherine\n@catherine___c\n·\nJan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...
2,king Keith,@KaConfessor,2022-01-17T23:51:41.000Z,king Keith\n@KaConfessor\n·\nJan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17T21:42:04.000Z,PETRIFIED CLIMATE PARENT\n@climate_parent\n·\n...,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...
4,Thomas Speight,@Thomas_Sp8,2022-01-17T21:10:40.000Z,Thomas Speight\n@Thomas_Sp8\n·\nJan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...


In [11]:
## For readability: userscreenname becomes screen_name and username becomes user_name
readable_names = {"userscreenname": "screen_name", "username": "user_name"}
tweets = tweets.rename(columns=readable_names)
tweets.head(n=5)

Unnamed: 0,screen_name,user_name,timestamp,text,embedded_text,emojis,comments,likes,retweets,image link,tweet url
0,Lauren Boebert,@laurenboebert,2022-01-17T23:32:38.000Z,Lauren Boebert\n@laurenboebert\n·\nJan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11.7K,[],https://twitter.com/laurenboebert/status/14832...
1,Catherine,@catherine___c,2022-01-17T22:54:02.000Z,Catherine\n@catherine___c\n·\nJan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...
2,king Keith,@KaConfessor,2022-01-17T23:51:41.000Z,king Keith\n@KaConfessor\n·\nJan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17T21:42:04.000Z,PETRIFIED CLIMATE PARENT\n@climate_parent\n·\n...,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...
4,Thomas Speight,@Thomas_Sp8,2022-01-17T21:10:40.000Z,Thomas Speight\n@Thomas_Sp8\n·\nJan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...


In [12]:
## Parse the timestamp strings into datetime values, then show the resulting dtypes
parsed_timestamps = pd.to_datetime(tweets["timestamp"])
tweets["timestamp"] = parsed_timestamps
print(tweets.dtypes)
tweets.head(n=5)

screen_name                   object
user_name                     object
timestamp        datetime64[ns, UTC]
text                          object
embedded_text                 object
emojis                        object
comments                      object
likes                         object
retweets                      object
image link                    object
tweet url                     object
dtype: object


Unnamed: 0,screen_name,user_name,timestamp,text,embedded_text,emojis,comments,likes,retweets,image link,tweet url
0,Lauren Boebert,@laurenboebert,2022-01-17 23:32:38+00:00,Lauren Boebert\n@laurenboebert\n·\nJan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11.7K,[],https://twitter.com/laurenboebert/status/14832...
1,Catherine,@catherine___c,2022-01-17 22:54:02+00:00,Catherine\n@catherine___c\n·\nJan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...
2,king Keith,@KaConfessor,2022-01-17 23:51:41+00:00,king Keith\n@KaConfessor\n·\nJan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17 21:42:04+00:00,PETRIFIED CLIMATE PARENT\n@climate_parent\n·\n...,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...
4,Thomas Speight,@Thomas_Sp8,2022-01-17 21:10:40+00:00,Thomas Speight\n@Thomas_Sp8\n·\nJan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...


In [13]:
## One thing we missed earlier: the comments, likes and retweets counts are not numeric.
# The reason could be that they have `NaN`s, that they might have a `K` at the end, or that they have commas in them.
## We'll need to find out how many records end with K.

In [14]:
## NaN retweets break the string checks below, so every string-based mask excludes them
retweet_values = tweets["retweets"]
retweet_present = retweet_values.notna()

## total records
print("number of rows : ", len(tweets.index))

## number of records that end with K
retweet_ends_with_k = retweet_present & retweet_values.str.endswith("K")
print("number of retweet values ending with K : ", int(retweet_ends_with_k.sum()))


## number of records that are numeric (digits only)
retweet_numeric_condition = retweet_present & retweet_values.str.isnumeric()
print("number of retweet values which are numeric : ", int(retweet_numeric_condition.sum()))


## number of nans
retweet_nans_condition = retweet_values.isna()
print("number of null retweets : ", int(retweet_nans_condition.sum()))


## the rows matched by none of the three masks above are the remaining records
retweet_remaining = ~(retweet_ends_with_k | retweet_numeric_condition | retweet_nans_condition)
print("number of numeric retweet values with comma : ", int(retweet_remaining.sum()))
## these are treated as comma-formatted numbers (e.g. 1,234) in the clean up that follows


number of rows :  9050
number of retweet values ending with K :  48
number of retweet values which are numeric :  8553
number of null retweets :  173
number of numeric retweet values with comma :  276


In [15]:
## Lets fix the data one group at a time
### First group: values that are neither K-suffixed, digits-only nor NaN (the comma-formatted numbers)
retweet_with_comma = ~(retweet_ends_with_k | retweet_numeric_condition | retweet_nans_condition)

### Strip the commas and convert the result to int in a single assignment
tweets.loc[retweet_with_comma, "retweets"] = (
    tweets.loc[retweet_with_comma, "retweets"].str.replace(",", "").astype(int)
)

In [16]:
## Lets convert the K-suffixed values (e.g. 11.7K) to plain numbers

### Step 1 : remove K from the values
### Step 2 : convert the remaining text to float
### Step 3 : multiply by 1000
### All three steps happen in one assignment: once K is removed, the mask can no longer be recomputed from the data
k_suffixed_retweets = tweets.loc[retweet_ends_with_k, "retweets"]
tweets.loc[retweet_ends_with_k, "retweets"] = k_suffixed_retweets.str.replace("K", "").astype(float) * 1000



In [17]:
## Now lets fix the missing values.
## Each missing retweet count is replaced by the mean retweet count of the same user_name,
## or by 0 when no retweet count is known for that user at all.

# Per-user means are computed once up front (mean() skips NaNs) instead of re-scanning
# the whole frame for every row that needs filling.
retweet_mean_by_user = tweets["retweets"].astype(float).groupby(tweets["user_name"]).mean()

def fill_retweet_mean(row):
     """Return `row` with "retweets" set to the mean retweet count of row["user_name"].

     Falls back to 0 when there is no mean for that user (user not found, or all of
     the user's retweet counts are missing).
     """
     retweet_mean = retweet_mean_by_user.get(row["user_name"])
     row["retweets"] = 0 if pd.isna(retweet_mean) else retweet_mean
     return row

# Only the "retweets" column is written back, so the other columns of the filled rows stay untouched.
tweets.loc[retweet_nans_condition, "retweets"] = tweets[retweet_nans_condition].apply(fill_retweet_mean, axis="columns")["retweets"]


In [18]:
### Check that no retweet count is missing any more.
### A bare expression in the middle of a cell is never displayed, so the check is an assert.
assert tweets["retweets"].notna().all(), "retweets still contains missing values"

## Round to the nearest whole number and convert to int for consistency
tweets["retweets"] = tweets["retweets"].astype(float).round().astype(int)

In [19]:
## Confirm the dtype of every column after the retweets conversion
tweets.dtypes

## the retweets column should now be listed as int64

screen_name                   object
user_name                     object
timestamp        datetime64[ns, UTC]
text                          object
embedded_text                 object
emojis                        object
comments                      object
likes                         object
retweets                       int64
image link                    object
tweet url                     object
dtype: object

In [20]:
## `comments` and `likes` are numeric fields too, so the same steps are repeated for them.
## NaN comments break the string checks below, so every string-based mask excludes them
comment_values = tweets["comments"]
comment_present = comment_values.notna()

## total records
print("number of rows : ", len(tweets.index))

## number of records that end with K
comments_ends_with_k = comment_present & comment_values.str.endswith("K")
print("number of comments values ending with K : ", int(comments_ends_with_k.sum()))

## number of records that are numeric (digits only)
comments_numeric_condition = comment_present & comment_values.str.isnumeric()
print("number of comments values which are numeric : ", int(comments_numeric_condition.sum()))

## number of nans
comments_nans_condition = comment_values.isna()
print("number of null comments : ", int(comments_nans_condition.sum()))

## the rows matched by none of the three masks above are the remaining records
comments_remaining = ~(comments_ends_with_k | comments_numeric_condition | comments_nans_condition)
print("number of numeric comment values with comma : ", int(comments_remaining.sum()))
## these are treated as comma-formatted numbers (e.g. 1,234) in the clean up that follows


number of rows :  9050
number of comments values ending with K :  0
number of comments values which are numeric :  6249
number of null comments :  2772
number of numeric comment values with comma :  29


In [21]:
## Lets fix the data one group at a time
### First group: values that are neither K-suffixed, digits-only nor NaN (the comma-formatted numbers)
comments_with_comma = ~(comments_ends_with_k | comments_numeric_condition | comments_nans_condition)

### Strip the commas and convert the result to int in a single assignment
tweets.loc[comments_with_comma, "comments"] = (
    tweets.loc[comments_with_comma, "comments"].str.replace(",", "").astype(int)
)

In [22]:
## Now lets fix the missing values.
## Each missing comment count is replaced by the mean comment count of the same user_name,
## or by 0 when no comment count is known for that user at all.

# Per-user means are computed once up front (mean() skips NaNs) instead of re-scanning
# the whole frame for every row that needs filling.
comments_mean_by_user = tweets["comments"].astype(float).groupby(tweets["user_name"]).mean()

def fill_comments_mean(row):
     """Return `row` with "comments" set to the mean comment count of row["user_name"].

     Falls back to 0 when there is no mean for that user (user not found, or all of
     the user's comment counts are missing).
     """
     comments_mean = comments_mean_by_user.get(row["user_name"])
     row["comments"] = 0 if pd.isna(comments_mean) else comments_mean
     return row

# Only the "comments" column is written back, so the other columns of the filled rows stay untouched.
tweets.loc[comments_nans_condition, "comments"] = tweets[comments_nans_condition].apply(fill_comments_mean, axis="columns")["comments"]


In [25]:
### Check that no comment count is missing any more.
### A bare expression in the middle of a cell is never displayed, so the check is an assert.
assert tweets["comments"].notna().all(), "comments still contains missing values"

## Round to the nearest whole number and convert to int for consistency
tweets["comments"] = tweets["comments"].astype(float).round().astype(int)
tweets.dtypes

screen_name                   object
user_name                     object
timestamp        datetime64[ns, UTC]
text                          object
embedded_text                 object
emojis                        object
comments                       int64
likes                         object
retweets                       int64
image link                    object
tweet url                     object
dtype: object

In [27]:
## NaN likes break the string checks below, so every string-based mask excludes them
like_values = tweets["likes"]
like_present = like_values.notna()

## total records
print("number of rows : ", len(tweets.index))

## number of records that end with K
likes_ends_with_k = like_present & like_values.str.endswith("K")
print("number of likes values ending with K : ", int(likes_ends_with_k.sum()))

## number of records that are numeric (digits only)
likes_numeric_condition = like_present & like_values.str.isnumeric()
print("number of likes values which are numeric : ", int(likes_numeric_condition.sum()))

## number of nans
likes_nans_condition = like_values.isna()
print("number of null likes : ", int(likes_nans_condition.sum()))

## the rows matched by none of the three masks above are the remaining records
likes_remaining = ~(likes_ends_with_k | likes_numeric_condition | likes_nans_condition)
print("number of numeric likes values with comma : ", int(likes_remaining.sum()))
## these are treated as comma-formatted numbers (e.g. 1,234) in the clean up that follows


number of rows :  9050
number of likes values ending with K :  11
number of likes values which are numeric :  8343
number of null likes :  619
number of numeric likes values with comma :  77


In [28]:
## Lets fix the data one group at a time
### First group: values that are neither K-suffixed, digits-only nor NaN (the comma-formatted numbers)
likes_with_comma = ~(likes_ends_with_k | likes_numeric_condition | likes_nans_condition)

### Strip the commas and convert the result to int in a single assignment
tweets.loc[likes_with_comma, "likes"] = (
    tweets.loc[likes_with_comma, "likes"].str.replace(",", "").astype(int)
)

In [29]:
## Lets convert the K-suffixed values (e.g. 11.7K) to plain numbers

### Step 1 : remove K from the values
### Step 2 : convert the remaining text to float
### Step 3 : multiply by 1000
### All three steps happen in one assignment: once K is removed, the mask can no longer be recomputed from the data
k_suffixed_likes = tweets.loc[likes_ends_with_k, "likes"]
tweets.loc[likes_ends_with_k, "likes"] = k_suffixed_likes.str.replace("K", "").astype(float) * 1000



In [30]:
## Now lets fix the missing values.
## Each missing like count is replaced by the mean like count of the same user_name,
## or by 0 when no like count is known for that user at all.

# Per-user means are computed once up front (mean() skips NaNs) instead of re-scanning
# the whole frame for every row that needs filling.
likes_mean_by_user = tweets["likes"].astype(float).groupby(tweets["user_name"]).mean()

def fill_likes_mean(row):
     """Return `row` with "likes" set to the mean like count of row["user_name"].

     Falls back to 0 when there is no mean for that user (user not found, or all of
     the user's like counts are missing).
     """
     likes_mean = likes_mean_by_user.get(row["user_name"])
     row["likes"] = 0 if pd.isna(likes_mean) else likes_mean
     return row

# Only the "likes" column is written back, so the other columns of the filled rows stay untouched.
tweets.loc[likes_nans_condition, "likes"] = tweets[likes_nans_condition].apply(fill_likes_mean, axis="columns")["likes"]

In [34]:
### Fraction of missing values per column; likes should now be at 0
print(tweets.isna().mean())

## Round to the nearest whole number and store likes as int for consistency
likes_as_float = tweets["likes"].astype(float)
tweets["likes"] = likes_as_float.round().astype(int)
print(tweets.dtypes)

screen_name      0.001436
user_name        0.000000
timestamp        0.000000
text             0.000000
embedded_text    0.000000
emojis           0.776133
comments         0.000000
likes            0.000000
retweets         0.000000
image link       0.000000
tweet url        0.000000
dtype: float64
screen_name                   object
user_name                     object
timestamp        datetime64[ns, UTC]
text                          object
embedded_text                 object
emojis                        object
comments                       int64
likes                          int64
retweets                       int64
image link                    object
tweet url                     object
dtype: object


In [37]:
## Emojis have ~78% of null values, which means not all tweets use emojis. For now it feels safe to just fill it with empty strings
## The filled column is assigned back rather than calling fillna(inplace=True) on the selected column:
## under pandas copy-on-write the selected column behaves like a copy, so the in-place call would not update `tweets`.
tweets["emojis"] = tweets["emojis"].fillna("")

In [38]:
## Fraction of missing values in each column after the clean up
tweets.isna().mean()

screen_name      0.001436
user_name        0.000000
timestamp        0.000000
text             0.000000
embedded_text    0.000000
emojis           0.000000
comments         0.000000
likes            0.000000
retweets         0.000000
image link       0.000000
tweet url        0.000000
dtype: float64