# Project 8 - Exploratory Data Analysis of Climate Change Tweets

### Install required libraries

In [85]:
# ! pip install pandas
# ! pip install numpy

### Import required libraries

In [86]:
import pandas as pd
import numpy as np
import plotly.express as px


### Read data set

In [87]:
## added encoding because data also has emojis
tweets = pd.read_csv('./data/climate_change_tweets.csv', encoding="utf-8");

#### Step 1 : Lets understand the data

In [88]:
## Lets look at the data
tweets.head()

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
0,Lauren Boebert,@laurenboebert,2022-01-17T23:32:38.000Z,Lauren Boebert\n@laurenboebert\n·\nJan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11.7K,[],https://twitter.com/laurenboebert/status/14832...
1,Catherine,@catherine___c,2022-01-17T22:54:02.000Z,Catherine\n@catherine___c\n·\nJan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...
2,king Keith,@KaConfessor,2022-01-17T23:51:41.000Z,king Keith\n@KaConfessor\n·\nJan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17T21:42:04.000Z,PETRIFIED CLIMATE PARENT\n@climate_parent\n·\n...,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...
4,Thomas Speight,@Thomas_Sp8,2022-01-17T21:10:40.000Z,Thomas Speight\n@Thomas_Sp8\n·\nJan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...


In [89]:
## Look at the tail of the data
tweets.tail()

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
9045,Dr Srijana Mitra Das,@srijanapiya17,2022-07-18T12:08:28.000Z,Dr Srijana Mitra Das\n@srijanapiya17\n·\nJul 18,#ClimateChange is now the greatest story on Ea...,,2.0,16.0,24.0,['https://pbs.twimg.com/profile_images/5140754...,https://twitter.com/srijanapiya17/status/15490...
9046,1%_Better_Every_Day,@jh336405,2022-07-18T00:33:20.000Z,1%_Better_Every_Day\n@jh336405\n·\nJul 18,Replying to \n@jh336405\n @acuna_r\n and 41 ot...,💯 💯 🌏,4.0,,,['https://pbs.twimg.com/profile_images/1442412...,https://twitter.com/jh336405/status/1548828230...
9047,David Schechter,@DavidSchechter,2022-07-18T21:13:13.000Z,David Schechter\n@DavidSchechter\n·\nJul 18,While Texans are being asked to use less elect...,,3.0,14.0,23.0,['https://pbs.twimg.com/card_img/1549138950475...,https://twitter.com/DavidSchechter/status/1549...
9048,Daily Climate,@TheDailyClimate,2022-07-18T10:15:09.000Z,Daily Climate\n@TheDailyClimate\n·\nJul 18,"Sea levels are rising, and communities are scr...",,,3.0,,['https://pbs.twimg.com/card_img/1547862999808...,https://twitter.com/TheDailyClimate/status/154...
9049,Robot Enthusiast,@robothusiast,2022-07-18T16:32:53.000Z,Robot Enthusiast\n@robothusiast\n·\nJul 18,UTS creates algae-analysing robot to combat cl...,,,,,['https://pbs.twimg.com/card_img/1549069752219...,https://twitter.com/robothusiast/status/154906...


In [90]:
## Lets look at the shape of the data
tweets.shape

(9050, 11)

In [91]:
## Lets look at the columns of the data
tweets.columns

Index(['UserScreenName', 'UserName', 'Timestamp', 'Text', 'Embedded_text',
       'Emojis', 'Comments', 'Likes', 'Retweets', 'Image link', 'Tweet URL'],
      dtype='object')

In [92]:
## Lets look at the data types of the columns
# read character column

pd.DataFrame(tweets.dtypes, columns=['DataType'])

Unnamed: 0,DataType
UserScreenName,object
UserName,object
Timestamp,object
Text,object
Embedded_text,object
Emojis,object
Comments,object
Likes,object
Retweets,object
Image link,object


##### Notes
* So there are 11 columns all of them are of type object.  
* That means some columns might have some missing information, e.g. some `NaN`s.  
* We might need to change column type for some columns, e.g. `Timestamp`, `Comments`, `Likes` etc. 

#### Step 2: Data Cleaning

In [93]:
tweets.isnull().mean()

UserScreenName    0.001436
UserName          0.000000
Timestamp         0.000000
Text              0.000000
Embedded_text     0.000000
Emojis            0.776133
Comments          0.306298
Likes             0.068398
Retweets          0.019116
Image link        0.000000
Tweet URL         0.000000
dtype: float64

##### Notes
* From the above we can see that ~70% of data do not have emojis, which doesn't necessarity mean missing data, people might be tweeting without emojis.
* Around 30% do not have any comments, this could be the case  where twitter accound do not have follower, or we have missing data. 
* Screen names are missing for < 1% of the data, we can ignore that for now.

##### Required Data Clean Up
* `Comments`, `Retweets` and `Likes` has missing data.  Two ways to fix that, 
     * Add missing data by setting it to 0.
     * Add missing data by finding user average.
* Convert `Timestamp` to `datetime`.
* Convert NaNs to empty strings for the columns we are interested in.
* We should also convert columns names to lowercase and rename few columns for readability.
* Remove `\n` and `\t` from text and embedded text



Convert column names to lower case

In [94]:
## convert the columns to lowercase
tweets.columns = tweets.columns.str.lower()
tweets.head()

Unnamed: 0,userscreenname,username,timestamp,text,embedded_text,emojis,comments,likes,retweets,image link,tweet url
0,Lauren Boebert,@laurenboebert,2022-01-17T23:32:38.000Z,Lauren Boebert\n@laurenboebert\n·\nJan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11.7K,[],https://twitter.com/laurenboebert/status/14832...
1,Catherine,@catherine___c,2022-01-17T22:54:02.000Z,Catherine\n@catherine___c\n·\nJan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...
2,king Keith,@KaConfessor,2022-01-17T23:51:41.000Z,king Keith\n@KaConfessor\n·\nJan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17T21:42:04.000Z,PETRIFIED CLIMATE PARENT\n@climate_parent\n·\n...,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...
4,Thomas Speight,@Thomas_Sp8,2022-01-17T21:10:40.000Z,Thomas Speight\n@Thomas_Sp8\n·\nJan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...


Rename columns for better readability

In [95]:
## rename userscreenname to screen_name and username to user_name for readability
tweets.rename(columns={'userscreenname':'screen_name', 'username':'user_name', 'image link': 'image_link', 'tweet url': 'tweet_url'}, inplace=True)
tweets.head()

Unnamed: 0,screen_name,user_name,timestamp,text,embedded_text,emojis,comments,likes,retweets,image_link,tweet_url
0,Lauren Boebert,@laurenboebert,2022-01-17T23:32:38.000Z,Lauren Boebert\n@laurenboebert\n·\nJan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11.7K,[],https://twitter.com/laurenboebert/status/14832...
1,Catherine,@catherine___c,2022-01-17T22:54:02.000Z,Catherine\n@catherine___c\n·\nJan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...
2,king Keith,@KaConfessor,2022-01-17T23:51:41.000Z,king Keith\n@KaConfessor\n·\nJan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17T21:42:04.000Z,PETRIFIED CLIMATE PARENT\n@climate_parent\n·\n...,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...
4,Thomas Speight,@Thomas_Sp8,2022-01-17T21:10:40.000Z,Thomas Speight\n@Thomas_Sp8\n·\nJan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...


Convert `timestamp` to `datetime`

In [96]:
## convert timestamp to datetime object
tweets["timestamp"] = pd.to_datetime(tweets["timestamp"])
print(tweets.dtypes)

screen_name                   object
user_name                     object
timestamp        datetime64[ns, UTC]
text                          object
embedded_text                 object
emojis                        object
comments                      object
likes                         object
retweets                      object
image_link                    object
tweet_url                     object
dtype: object


##### Notes
* One thing we missed earlier was comments, likes and retweet counts are not numeric. 
* The reason could be they have `NaNs` and they might have `K` in the end and they have commas in them. We'll need to find out how many records end with K. 

Lets check the amount and type of data in `retweets` column

In [97]:
## Having NaNs in retweets is causing error so filter out those records
## total records
print("number of rows : ", tweets.shape[0])

## number of records that end with K
retweet_ends_with_k = ~pd.isna(tweets["retweets"]) & tweets["retweets"].str.endswith("K")
print("number of retweet values ending with K : ", tweets[retweet_ends_with_k].shape[0])


## number of records that are numeric
retweet_numeric_condition = ~pd.isna(tweets["retweets"]) & tweets["retweets"].str.isnumeric()
print("number of retweet values which are numeric : ",tweets[retweet_numeric_condition].shape[0])


## number of nans
retweet_nans_condition = pd.isna(tweets["retweets"])
print("number of null retweets : ",tweets[retweet_nans_condition].shape[0])


## some records are still missing why is that :thinking:
print("number of numeric retweet values with comma : ", tweets[~retweet_ends_with_k & ~retweet_numeric_condition & ~retweet_nans_condition].shape[0])
## These are regular retweets :duh:


number of rows :  9050
number of retweet values ending with K :  48
number of retweet values which are numeric :  8553
number of null retweets :  173
number of numeric retweet values with comma :  276


Here we are, 
* Removing `,` from the values
* Converting values to `int`

In [98]:
## Lets fix the data one at a time
### lets convert regular numbers to int
### Step 1 : remove commas from values
tweets.loc[~retweet_ends_with_k & ~retweet_numeric_condition & ~retweet_nans_condition, "retweets"] = tweets.loc[~retweet_ends_with_k & ~retweet_numeric_condition & ~retweet_nans_condition, "retweets"].str.replace(",", "")

### Step 2 : Convert data to int.
tweets.loc[~retweet_ends_with_k & ~retweet_numeric_condition & ~retweet_nans_condition, "retweets"] = tweets.loc[~retweet_ends_with_k & ~retweet_numeric_condition & ~retweet_nans_condition, "retweets"].astype(int)

Here we are
* Removing `K` from the values
* Converting values to `float`
* Multiplying by 1000 to get correct absolute value

In [99]:
## Lets convert numbers ending with K to int

### Step 1: remove K from values
### Step 2 : convert then numbers to int
### Step 3 : multiply by 1000
### We do all this in one step because after removing K we will not be able to filter these rows
tweets.loc[retweet_ends_with_k, "retweets"] = tweets.loc[retweet_ends_with_k, "retweets"].str.replace("K", "").astype(float) * 1000



Finally we will, 
* Write a function to calculate average number of `retweets` for that user
* If average value is a valid number set missing value to the mean. 
* If average number is `NaN` (user might have tweeted just once) then set missing value to 0

In [100]:
## Now lets fix the missing values. 
## we will fill retweets by checking if we can get mean retweet for that userID if not set it to 0
def fill_retweet_mean(row):
     retweet_mean = tweets[tweets["user_name"] == row["user_name"]]["retweets"].astype(float).mean()
     row["retweets"] = retweet_mean if pd.isna(retweet_mean) == False else 0
     return row

tweets[retweet_nans_condition] = tweets[retweet_nans_condition].apply(fill_retweet_mean, axis="columns")


Here we are, 
* Validating our clean data.
* Rounding down the numbers and converting them to `int` for consistency. 

In [101]:
### Lets check how our retweets
tweets.isnull().mean()
#### Yay 0 null retweets!

## Lets round down and convert number into int for consistancy
tweets["retweets"] = round(tweets["retweets"].astype(float)).astype(int)

In [102]:
## Lets confirm dtype
tweets.dtypes

## yay tweets are of int64 :woot: :woot:

screen_name                   object
user_name                     object
timestamp        datetime64[ns, UTC]
text                          object
embedded_text                 object
emojis                        object
comments                      object
likes                         object
retweets                       int64
image_link                    object
tweet_url                     object
dtype: object

Next we'll repeate all the above steps for `comments` and `likes` columns as well. 

In [103]:
## We'll have to repeate the same steps for `comments` and `likes` as well since these are numeric fields as well. 
## Having NaNs in comments is causing error so filter out those records
## total records
print("number of rows : ", tweets.shape[0])

## number of records that end with K
comments_ends_with_k = ~pd.isna(tweets["comments"]) & tweets["comments"].str.endswith("K")
print("number of comments values ending with K : ", tweets[comments_ends_with_k].shape[0])

## number of records that are numeric
comments_numeric_condition = ~pd.isna(tweets["comments"]) & tweets["comments"].str.isnumeric()
print("number of comments values which are numeric : ",tweets[comments_numeric_condition].shape[0])

## number of nans
comments_nans_condition = pd.isna(tweets["comments"])
print("number of null comments : ",tweets[comments_nans_condition].shape[0])

## some records are still missing why is that :thinking:
print("number of numeric comment values with comma : ", tweets[~comments_ends_with_k & ~comments_numeric_condition & ~comments_nans_condition].shape[0])
## These are regular comments :duh:


number of rows :  9050
number of comments values ending with K :  0
number of comments values which are numeric :  6249
number of null comments :  2772
number of numeric comment values with comma :  29


In [104]:
## Lets fix the data one at a time
### lets convert regular numbers to int
### Step 1 : remove commas from values
tweets.loc[~comments_ends_with_k & ~comments_numeric_condition & ~comments_nans_condition, "comments"] = tweets.loc[~comments_ends_with_k & ~comments_numeric_condition & ~comments_nans_condition, "comments"].str.replace(",", "")

### Step 2 : Convert data to int.
tweets.loc[~comments_ends_with_k & ~comments_numeric_condition & ~comments_nans_condition, "comments"] = tweets.loc[~comments_ends_with_k & ~comments_numeric_condition & ~comments_nans_condition, "comments"].astype(int)

In [105]:
## Now lets fix the missing values. 
## we will fill retweets by checking if we can get mean retweet for that userID if not set it to 0
def fill_comments_mean(row):
     comments_mean = tweets[tweets["user_name"] == row["user_name"]]["comments"].astype(float).mean()
     row["comments"] = comments_mean if pd.isna(comments_mean) == False else 0
     return row

tweets[comments_nans_condition] = tweets[comments_nans_condition].apply(fill_comments_mean, axis="columns")


In [106]:
### Lets check how our retweets
tweets.isnull().mean()
#### Yay 0 null retweets!

## Lets round down and convert number into int for consistancy
tweets["comments"] = round(tweets["comments"].astype(float)).astype(int)
tweets.dtypes

screen_name                   object
user_name                     object
timestamp        datetime64[ns, UTC]
text                          object
embedded_text                 object
emojis                        object
comments                       int64
likes                         object
retweets                       int64
image_link                    object
tweet_url                     object
dtype: object

In [107]:
## Having NaNs in likes is causing error so filter out those records
## total records
print("number of rows : ", tweets.shape[0])

## number of records that end with K
likes_ends_with_k = ~pd.isna(tweets["likes"]) & tweets["likes"].str.endswith("K")
print("number of likes values ending with K : ", tweets[likes_ends_with_k].shape[0])

## number of records that are numeric
likes_numeric_condition = ~pd.isna(tweets["likes"]) & tweets["likes"].str.isnumeric()
print("number of likes values which are numeric : ",tweets[likes_numeric_condition].shape[0])

## number of nans
likes_nans_condition = pd.isna(tweets["likes"])
print("number of null likes : ",tweets[likes_nans_condition].shape[0])

## some records are still missing why is that :thinking:
print("number of numeric likes values with comma : ", tweets[~likes_ends_with_k & ~likes_numeric_condition & ~likes_nans_condition].shape[0])
## These are regular comments :duh:


number of rows :  9050
number of likes values ending with K :  11
number of likes values which are numeric :  8343
number of null likes :  619
number of numeric likes values with comma :  77


In [108]:
## Lets fix the data one at a time
### lets convert regular numbers to int
### Step 1 : remove commas from values
tweets.loc[~likes_ends_with_k & ~likes_numeric_condition & ~likes_nans_condition, "likes"] = tweets.loc[~likes_ends_with_k & ~likes_numeric_condition & ~likes_nans_condition, "likes"].str.replace(",", "")

### Step 2 : Convert data to int.
tweets.loc[~likes_ends_with_k & ~likes_numeric_condition & ~likes_nans_condition, "likes"] = tweets.loc[~likes_ends_with_k & ~likes_numeric_condition & ~likes_nans_condition, "likes"].astype(int)

In [109]:
## Lets convert numbers ending with K to int

### Step 1: remove K from values
### Step 2 : convert then numbers to int
### Step 3 : multiply by 1000
### We do all this in one step because after removing K we will not be able to filter these rows
tweets.loc[likes_ends_with_k, "likes"] = tweets.loc[likes_ends_with_k, "likes"].str.replace("K", "").astype(float) * 1000



In [110]:
## Now lets fix the missing values. 
## we will fill retweets by checking if we can get mean retweet for that userID if not set it to 0
def fill_likes_mean(row):
     likes_mean = tweets[tweets["user_name"] == row["user_name"]]["likes"].astype(float).mean()
     row["likes"] = likes_mean if pd.isna(likes_mean) == False else 0
     return row

tweets[likes_nans_condition] = tweets[likes_nans_condition].apply(fill_likes_mean, axis="columns")

In [111]:
### Lets check how our retweets
print(tweets.isnull().mean())
#### Yay 0 null retweets!

## Lets round down and convert number into int for consistancy
tweets["likes"] = round(tweets["likes"].astype(float)).astype(int)
print(tweets.dtypes)

screen_name      0.001436
user_name        0.000000
timestamp        0.000000
text             0.000000
embedded_text    0.000000
emojis           0.776133
comments         0.000000
likes            0.000000
retweets         0.000000
image_link       0.000000
tweet_url        0.000000
dtype: float64
screen_name                   object
user_name                     object
timestamp        datetime64[ns, UTC]
text                          object
embedded_text                 object
emojis                        object
comments                       int64
likes                          int64
retweets                       int64
image_link                    object
tweet_url                     object
dtype: object


Fill missing values in `emojis` column with empty string

In [112]:
## Emojis have ~77% of null values, which means not all tweets use emojis. For now it feels safe to just fill it with empty strings
tweets["emojis"].fillna("", inplace=True)

In [113]:
tweets.isnull().mean()

screen_name      0.001436
user_name        0.000000
timestamp        0.000000
text             0.000000
embedded_text    0.000000
emojis           0.000000
comments         0.000000
likes            0.000000
retweets         0.000000
image_link       0.000000
tweet_url        0.000000
dtype: float64

Change `text` and `embedded_text` columns to string

In [114]:
# change text and embedded text to string
tweets["text"] = tweets["text"].astype(str)

tweets["embedded_text"] = tweets["embedded_text"].astype(str)

print(tweets.dtypes)

screen_name                   object
user_name                     object
timestamp        datetime64[ns, UTC]
text                          object
embedded_text                 object
emojis                        object
comments                       int64
likes                          int64
retweets                       int64
image_link                    object
tweet_url                     object
dtype: object


Remove `\n` and `\t` characters from `text` and `embedded_text` columns

In [115]:
## Replace \n, \t in text and embedded text with empty string
tweets["text"] = tweets["text"].replace("\\n", " ", regex=True)
tweets["text"] = tweets["text"].replace("\\t", " ",regex=True)
tweets["embedded_text"] = tweets["embedded_text"].replace("\\n", " ", regex=True)
tweets["embedded_text"] = tweets["embedded_text"].replace("\\t", " ", regex=True)

#### Finally we have clean data!

#### Step 3 : Exploratory Data Analysis

##### What kind of answers/analysis can we do on this dataset? 
* On average how many tweets per-day, per-week, per-month?
* Who tweets the most about climate?
  * On average how many tweets/day, tweets/week, tweets/month?
  * Does the frequency of tweets changes on weekly/monthly bases? 
* Who tweets the least about climate?
* What are the most commmon hash-tags?
* What are the least common hash-tags?
* How many tweets have media links?
* Patterns in Tweet details
  * Average character count
  * Average word count.
* Any co-relation between tweet details and engagement? 

How many tweets per day? 
* Before we start doing that we'll need to convert timestamp to just date.  We'll do that by creating a new column

In [116]:
## How many tweets per day? 
## Before we start doing that we'll need to convert timestamp to just date.  We'll do that by creating a new column

tweets.loc[:,"timestamp"]

#convert column of timestamps to datetimes
tweets["date"] = tweets["timestamp"].apply(lambda x: x.date())

tweets["day"] = tweets["timestamp"].dt.day

tweets["month"] = tweets["timestamp"].dt.month

# Lets also split them into day and month to understand patterns. 
tweets.head()

Unnamed: 0,screen_name,user_name,timestamp,text,embedded_text,emojis,comments,likes,retweets,image_link,tweet_url,date,day,month
0,Lauren Boebert,@laurenboebert,2022-01-17 23:32:38+00:00,Lauren Boebert @laurenboebert · Jan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11700,[],https://twitter.com/laurenboebert/status/14832...,2022-01-17,17,1
1,Catherine,@catherine___c,2022-01-17 22:54:02+00:00,Catherine @catherine___c · Jan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...,2022-01-17,17,1
2,king Keith,@KaConfessor,2022-01-17 23:51:41+00:00,king Keith @KaConfessor · Jan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...,2022-01-17,17,1
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17 21:42:04+00:00,PETRIFIED CLIMATE PARENT @climate_parent · Jan 17,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...,2022-01-17,17,1
4,Thomas Speight,@Thomas_Sp8,2022-01-17 21:10:40+00:00,Thomas Speight @Thomas_Sp8 · Jan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...,2022-01-17,17,1


In [117]:
## to calculate tweets per day, we'll need to group by date, and then find mean. 
pd.DataFrame(tweets.groupby(["date"])["tweet_url"].count())


## Converted graph logic into functions so that its easy to create dynamic dash app later on. 
def get_monthly_tweets(month):
     monthly_condition = tweets["month"] == month
     monthly_tweets = pd.DataFrame(tweets[monthly_condition].groupby(["date"],as_index=False)["tweet_url"].count()).rename(columns={"tweet_url": "total_tweets"})
     return monthly_tweets

monthly_tweets = get_monthly_tweets(1)
monthly_fig = px.line(monthly_tweets, x="date", y="total_tweets", title='Climate Change Tweets in Month of Jan')
monthly_fig.show()

What are the average tweets per day?

In [118]:
## average tweets per day
tweets.groupby(["date"])["tweet_url"].count().mean()

49.45355191256831

In [119]:
## when was the minimum number of tweets
tweets.groupby(["date"])["tweet_url"].count().idxmin()

datetime.date(2022, 6, 1)

Which day had the max number of tweets?

In [120]:
## when was max number of tweet
tweets.groupby(["date"])["tweet_url"].count().idxmax()

datetime.date(2022, 7, 12)

What are average tweets per month?

In [121]:
## to calculate tweets per month, we'll need to group by date, and then find mean. 
pd.DataFrame(tweets.groupby(["month"])["tweet_url"].count())


## Lets plot this in a line chart to understand the trend for the year

## Converted graph logic into functions so that its easy to create dynamic dash app later on. 
def get_yearly_trend():
     monthly_tweets = pd.DataFrame(tweets.groupby(["month"],as_index=False)["tweet_url"].count()).rename(columns={"tweet_url": "total_tweets"})
     return monthly_tweets

ytd_tweets = get_yearly_trend()

ytd_fig = px.line(ytd_tweets, x="month", y="total_tweets", title='Climate Change Tweets YTD')
ytd_fig.show()


In [122]:
## average tweets per month
tweets.groupby(["month"])["tweet_url"].count().mean()

1292.857142857143

In [123]:
# 2. Who tweets the most about climate
print(tweets.groupby(["user_name"])["tweet_url"].count().idxmax())
##    2.1. On average how many tweets/day, tweets/week, tweets/month
## tweets/day
print(tweets[tweets["user_name"] == tweets.groupby(["user_name"])["tweet_url"].count().idxmax()].groupby(["day"])["tweet_url"].count().mean())

## tweets/month
print(tweets[tweets["user_name"] == tweets.groupby(["user_name"])["tweet_url"].count().idxmax()].groupby(["month"])["tweet_url"].count().mean())
##    2.2. Does the frequency of tweets changes on weekly/monthly bases? 

## Frequency of tweets per-month
print(tweets[tweets["user_name"] == tweets.groupby(["user_name"])["tweet_url"].count().idxmax()].groupby(["month"])["tweet_url"].count())


@insideclimate
1.6363636363636365
5.142857142857143
month
1     4
2     3
3     3
4     6
5     9
6    10
7     1
Name: tweet_url, dtype: int64


In [124]:
## How many users were tweeting about climate change
len(tweets["user_name"].unique())

## Users and number of tweets.
tweets["user_name"].value_counts()

@insideclimate      36
@climatecouncil     31
@ClimateBen         27
@great_thunberg     22
@ECOWARRIORSS       21
                    ..
@gavan_mcfadzean     1
@PoliBard            1
@SecVilsack          1
@OhBeing             1
@robothusiast        1
Name: user_name, Length: 7036, dtype: int64

In [125]:
## Engagements on tweet?

## What was average likes on a tweet?
print("Average Likes : ", tweets["likes"].mean())

## What was max likes on a tweet?
print("Max Likes : ",tweets["likes"].max())

## Who got the max likes?
print("Who got maximum likes? ", tweets.loc[tweets["likes"].idxmax(), ["user_name"]][0])

## How much did this person tweet?
print("Number of tweets by max liked tweet user", len(tweets[tweets["user_name"] == tweets.loc[tweets["likes"].idxmax(), ["user_name"]][0]]))

## What was the average comments 
print("Average Comments : ", tweets["comments"].mean())

print("----------")

## What was max likes on a tweet?
print("Max Comments : ",tweets["comments"].max())

## Who got the max likes?
print("Who got max likes? ", tweets.loc[tweets["comments"].idxmax(), ["user_name"]][0])

## How much did this person tweet?
print("Number of tweets by max commented tweet user", len(tweets[tweets["user_name"] == tweets.loc[tweets["comments"].idxmax(), ["user_name"]][0]]))

print("----------")

## What was the average retweets 
print("Average retweets : ", tweets["retweets"].mean())

## What was max retweets on a tweet?
print("Max retweets : ",tweets["retweets"].max())

## Who got the max retweets?
print("Who got max retweets?", tweets.loc[tweets["retweets"].idxmax(), ["user_name"]][0])

## How much did this person tweet?
print("Number of tweets by max retweets  user", len(tweets[tweets["user_name"] == tweets.loc[tweets["retweets"].idxmax(), ["user_name"]][0]]))


Average Likes :  72.22077348066298
Max Likes :  21300
Who got maximum likes?  @WhipRealer
Number of tweets by max liked tweet user 1
Average Comments :  18.805193370165746
----------
Max Comments :  6746
Who got max likes?  @Ek7_PrN
Number of tweets by max commented tweet user 1
----------
Average retweets :  299.6942541436464
Max retweets :  150500
Who got max retweets? @POTUS
Number of tweets by max retweets  user 2


In [126]:
## 4. Most commmon hash-tags

## Initialize hashtags column
tweets["hashtags"] = ""
tweets["hashtag_count"] = 0


## Lets create a function to run for each rows
def set_hashtags(row): 
     ## if there are words starting with #
     if(row["embedded_text"].find("#") != -1):
          hashtags = [hashtags for hashtags in row["embedded_text"].split(' ') if hashtags.startswith('#')]
          row["hashtags"] = ",".join(hashtags)
          row["hashtag_count"] = len(hashtags)
            
     return row


# ## Lets start with finding strings with hastags
hastag_condition = tweets["embedded_text"].str.find("#") != -1

# # tweets[hastag_condition]["embedded_text"].str.split(' ')

# # for tags in tweets[hastag_condition]["embedded_text"].str.split(' '):
# #      hashtags = [hashtags.strip("\n") for hashtags in tags if hashtags.startswith('#')]
# #      print(hashtags)
tweets = tweets.apply(set_hashtags, axis="columns")