### Imports

In [1]:
## 0.A Import libraries
import os

# dates
from datetime import datetime

# data handling
import numpy as np
import pandas as pd

# Data viz
%matplotlib inline
import matplotlib.pyplot as plt

In [69]:
## 0.B Import functions
from utils.custom_functions import generate_descriptive_analysis
from utils.custom_functions import label_sent_analysis, label_political_entity

In [3]:
## O.C Arguments Paths
tweets_filename = r"01.Tweets.csv"
retweets_filename = r"02.Retweets.csv"
users_filename = r"03.Users.csv"

gravity_filename = r"gravity_1995_2014.dta"

trade_filenae = r"TradeData.csv"

gdp_filename_1 = r"gdp_1.csv"
gdp_filename_2 = r"gdp_2.csv"

population_filename = r"pop.csv"

country_code_filename = r"iso-country-codes.csv"

In [4]:
folder_path = os.path.dirname(os.getcwd())
path_to_data = os.path.join(folder_path,'data')

## 1. Data

In [5]:
## 1. Import data
users = pd.read_csv(os.path.join(path_to_data,users_filename),
    dtype = {'user_id': 'int64'}
)
users = users.drop(columns=['Unnamed: 0'])
# generate_descriptive_analysis(df = users, output_filename="users_descript") # Comment out 

# Let's have a look
users.head()

Unnamed: 0,user_id,bio,bio_lang,bio_en,verified,political,political_lang,political_en,date_joined,profile_image,profile_banner,profile_location,profile_location_lang,profile_location_country_en,num_tweets,media_count,followers,following
0,116680494,#news upate #uk news # austalia china # japan...,en 39,#news upate #uk news # austalia china # japan...,False,,,,2010-02-23 08:03:27+00:00,https://pbs.twimg.com/profile_images/713321288...,,The World in Tweets,en 75,,220821,0.0,667,11
1,172576367,Option Trader | Fundamental and Indicasies | M...,en 42,Option Trader | Fundamental and Indicasies | M...,False,,,,2010-07-30 01:45:04+00:00,https://pbs.twimg.com/profile_images/109116695...,,Bali - Indonesia,id 50,Indonesia,226257,0.0,28,21
2,154226261,"BBC Science and Tech Correspondent, based in s...",en 87,"BBC Science and Tech Correspondent, based in s...",False,,,,2010-06-10 18:10:08+00:00,https://pbs.twimg.com/profile_images/100002619...,https://pbs.twimg.com/profile_banners/15422626...,"Cambridge, England",en 30,United Kingdom,12505,830.0,7948,1417
3,61733677,Curious about finance,en 86,Curious about finance,False,,,,2009-07-31 09:23:42+00:00,https://abs.twimg.com/sticky/default_profile_i...,,"Ulaanbaatar, Mongolia",en 36,THE MOST KNOW ᠮᠤᠩᠭᠤᠯ ᠤᠯᠤᠰ,18502,0.0,322,4
4,87775422,Start a conversation as we share news and anal...,en 95,Start a conversation as we share news and anal...,True,China state-affiliated media,en 85,China state-affiliated media,2009-11-05 20:30:10+00:00,https://pbs.twimg.com/profile_images/133814999...,https://pbs.twimg.com/profile_banners/87775422...,"Beijing, China",en 71,China,161125,84967.0,4261985,577


In [6]:
# a. Top Ten countries represented?
users.profile_location_country_en.value_counts().sort_values()[-10:]

France             3821
Luzon              4424
China              4718
Canada             5839
Indonesia          5977
Australia          8966
United Kingdom    12082
Pakistan          12245
India             16375
United States     37778
Name: profile_location_country_en, dtype: int64

Tweets data

In [7]:
## 1. Import data
tweets = pd.read_csv(os.path.join(path_to_data, tweets_filename),
    dtype = {'user_id': 'int64'},
    parse_dates=['timestamp']
)
tweets = tweets.drop(columns=['Unnamed: 0'])

# generate_descriptive_analysis(df = tweets, output_filename="tweets_descript") # Comment out 

# Let's have a look
tweets.head()

  tweets = pd.read_csv(os.path.join(path_to_data, tweets_filename),


Unnamed: 0,user_id,timestamp,tweet_id,sentiment_polarity,text_lang_ft,text_normalized,links,hashtag,hashtag_lang,hashtag_en,...,GIF_url,likes,retweets,replies,reply_to_user,mentioned_users,quoted_tweet,quoted_by_count,credibility,tweet_source
0,116680494,2013-09-03 02:22:09+00:00,374718928682885121,0.2732,en 88,"['nation', 'agree', 'build', 'new', 'silk', 'r...",http://bit.ly/17lyTPM,,,,...,,0,0,0,,,,0,1.0,twitterfeed
1,172576367,2013-09-03 02:22:11+00:00,374718937889402880,0.2732,en 84,"['nation', 'agree', 'build', 'new', 'silk', 'r...",http://bit.ly/17lySv6,,,,...,,0,0,0,,,,0,1.0,twitterfeed
2,154226261,2013-09-03 10:11:50+00:00,374837127873175553,0.0,en 47,"['high', 'speed', 'rail', 'china', 'new', 'sil...",,,,,...,,1,0,0,,83521919.0,,0,,Twitter for Websites
3,61733677,2013-09-03 11:33:26+00:00,374857665735704576,0.2732,en 65,"['nation', 'agree', 'build', 'new', 'silk', 'r...",,,,,...,,0,0,0,,,,0,,Twitter Web Client
4,87775422,2013-09-03 20:10:51+00:00,374987876737765376,0.0,en 56,"['china', 'kazakhstan', 'tajikistan', 'russia'...",http://usa.chinadaily.com.cn/epaper/2013-09/03...,China,en 50,China,...,,2,6,0,,,,0,0.0,Hootsuite


In [8]:
tweets['year'] = pd.DatetimeIndex(tweets['timestamp']).year

In [9]:
tweets.year.value_counts().sort_values(ascending=True)

2013      1564
2014      8889
2015     25652
2016     32434
2021     57982
2020     69133
2018     89069
2017    107923
2019    108064
Name: year, dtype: int64

Retweets data

In [20]:
## 1. Import data
retweets = pd.read_csv(os.path.join(path_to_data, retweets_filename),
    dtype = {'target': 'int64'}
)
retweets = retweets.drop(columns=['Unnamed: 0'])

# Let's have a look
retweets.head()

Unnamed: 0,source,target,timestamp,tweet_id
0,87775422.0,137271696,2013-09-03 21:09:47+00:00,3.749879e+17
1,87775422.0,338426531,2013-09-03 20:39:47+00:00,3.749879e+17
2,87775422.0,1483526748,2013-09-03 20:35:37+00:00,3.749879e+17
3,87775422.0,1443874496,2013-09-03 20:33:38+00:00,3.749879e+17
4,87775422.0,1613544854,2013-09-03 20:13:25+00:00,3.749879e+17


## Join Twitter datasets

In [10]:
tweets.columns

Index(['user_id', 'timestamp', 'tweet_id', 'sentiment_polarity',
       'text_lang_ft', 'text_normalized', 'links', 'hashtag', 'hashtag_lang',
       'hashtag_en', 'cashtag', 'media', 'image_url', 'video_url', 'GIF_url',
       'likes', 'retweets', 'replies', 'reply_to_user', 'mentioned_users',
       'quoted_tweet', 'quoted_by_count', 'credibility', 'tweet_source',
       'year'],
      dtype='object')

In [11]:
tweets['user_id'] = pd.to_numeric(tweets['user_id'])
users['user_id'] = pd.to_numeric(users['user_id'])
users[['user_id','profile_location_country_en', 'political_en']].head()
tweets = tweets[['tweet_id','user_id', 'sentiment_polarity', 'likes', 'retweets', 'year']]

I want the yealry number of tweets about the project by country, the number of tweets positive, the number of tweets negative, the number of tweets made by a political affiliate.

In [12]:
# 1. Joining twitter dataset with the users
merged_tweets = tweets.merge(users[['user_id','profile_location_country_en', 'political_en']], on='user_id', how='right')

In [13]:
merged_tweets['tweet_id'].count()

118413525

In [14]:
len(merged_tweets['tweet_id'].unique())

500701

In [26]:
merged_tweets.head()

Unnamed: 0,tweet_id,user_id,sentiment_polarity,likes,retweets,year,profile_location_country_en,political_en
0,3.747189e+17,116680494,0.2732,0.0,0.0,2013.0,,
1,3.762299e+17,116680494,0.0,0.0,0.0,2013.0,,
2,4.240629e+17,116680494,0.3818,0.0,0.0,2014.0,,
3,4.491235e+17,116680494,0.5719,0.0,0.0,2014.0,,
4,3.747189e+17,172576367,0.2732,0.0,0.0,2013.0,Indonesia,


In [15]:
merged_tweets = merged_tweets.dropna(subset='profile_location_country_en')
merged_tweets.head()

Unnamed: 0,tweet_id,user_id,sentiment_polarity,likes,retweets,year,profile_location_country_en,political_en
4,3.747189e+17,172576367,0.2732,0.0,0.0,2013.0,Indonesia,
5,3.762299e+17,172576367,0.0,0.0,0.0,2013.0,Indonesia,
6,4.240629e+17,172576367,0.3818,0.0,0.0,2014.0,Indonesia,
7,4.491235e+17,172576367,0.5719,0.0,0.0,2014.0,Indonesia,
8,3.748371e+17,154226261,0.0,1.0,0.0,2013.0,United Kingdom,


In [16]:

merged_tweets['tweet_id'].count()

43387003

In [17]:
len(merged_tweets['tweet_id'].unique())

348031

In [18]:
merged_tweets = merged_tweets.drop_duplicates(subset = 'tweet_id')

In [19]:
merged_tweets['tweet_id'].count()

348030

In [20]:
tweets = merged_tweets

In [21]:
tweets.head()

Unnamed: 0,tweet_id,user_id,sentiment_polarity,likes,retweets,year,profile_location_country_en,political_en
4,3.747189e+17,172576367,0.2732,0.0,0.0,2013.0,Indonesia,
5,3.762299e+17,172576367,0.0,0.0,0.0,2013.0,Indonesia,
6,4.240629e+17,172576367,0.3818,0.0,0.0,2014.0,Indonesia,
7,4.491235e+17,172576367,0.5719,0.0,0.0,2014.0,Indonesia,
8,3.748371e+17,154226261,0.0,1.0,0.0,2013.0,United Kingdom,


## Creating features

In [75]:
# if positive sentiment analysis, create a dummy equal to one, otherwise 0.
tweets["tweets_positive_sent"] = tweets["sentiment_polarity"].apply(label_sent_analysis)
tweets["tweets_political"] = tweets["political_en"].apply(label_political_entity)

In [76]:
tweets["tweets_political"].sum()

233398

In [73]:
tweets.political_en.replace('NaN', np.nan).dropna()

6                       China state-affiliated media
7                       China state-affiliated media
8                       China state-affiliated media
9                       China state-affiliated media
10                      China state-affiliated media
                             ...                    
335617    Media affiliated to the Chinese government
338230                  State-affiliated media China
344096                  China state-affiliated media
344097                  China state-affiliated media
346629                  China state-affiliated media
Name: political_en, Length: 10329, dtype: object

I will work on a yearly level. 
I want the aggregate count of tweets, their average sentiment polarity, the aggregate level of retweets, the average of retweets by month and by country

In [13]:
# Let's reduce dataset dimension and add a country code
columns= ["year","profile_location_country_en","tweets_positive_sent", "tweets_political",'likes','retweets']
tweets = tweets[columns]

In [23]:
country_code = pd.read_csv(os.path.join(path_to_data, country_code_filename))
country_code = country_code.rename(columns = {"English short name lower case": "country"})
tweets = pd.merge(tweets, country_code, left_on='profile_location_country_en',right_on='country', how='left')

In [24]:
tweets.head()

Unnamed: 0,tweet_id,user_id,sentiment_polarity,likes,retweets,year,profile_location_country_en,political_en,tweets_positive_sent,tweets_political,country,Alpha-2 code,Alpha-3 code,Numeric code,ISO 3166-2
0,3.747189e+17,172576367,0.2732,0.0,0.0,2013.0,Indonesia,,1,1,Indonesia,ID,IDN,360.0,ISO 3166-2:ID
1,3.762299e+17,172576367,0.0,0.0,0.0,2013.0,Indonesia,,0,1,Indonesia,ID,IDN,360.0,ISO 3166-2:ID
2,4.240629e+17,172576367,0.3818,0.0,0.0,2014.0,Indonesia,,1,1,Indonesia,ID,IDN,360.0,ISO 3166-2:ID
3,4.491235e+17,172576367,0.5719,0.0,0.0,2014.0,Indonesia,,1,1,Indonesia,ID,IDN,360.0,ISO 3166-2:ID
4,3.748371e+17,154226261,0.0,1.0,0.0,2013.0,United Kingdom,,0,1,United Kingdom,GB,GBR,826.0,ISO 3166-2:GB


In [25]:
tweets['Key'] = tweets['Alpha-3 code'] + '_' + tweets['year'].astype(str)

In [26]:
tweets.to_csv('junk_tweets_and_users.csv')

In [27]:
len(tweets['country'].unique())

141

In [28]:
tweets = pd.read_csv('junk_tweets_and_users.csv')

In [29]:
tweets.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,user_id,sentiment_polarity,likes,retweets,year,profile_location_country_en,political_en,tweets_positive_sent,tweets_political,country,Alpha-2 code,Alpha-3 code,Numeric code,ISO 3166-2,Key
0,0,3.747189e+17,172576367,0.2732,0.0,0.0,2013.0,Indonesia,,1,1,Indonesia,ID,IDN,360.0,ISO 3166-2:ID,IDN_2013.0
1,1,3.762299e+17,172576367,0.0,0.0,0.0,2013.0,Indonesia,,0,1,Indonesia,ID,IDN,360.0,ISO 3166-2:ID,IDN_2013.0
2,2,4.240629e+17,172576367,0.3818,0.0,0.0,2014.0,Indonesia,,1,1,Indonesia,ID,IDN,360.0,ISO 3166-2:ID,IDN_2014.0
3,3,4.491235e+17,172576367,0.5719,0.0,0.0,2014.0,Indonesia,,1,1,Indonesia,ID,IDN,360.0,ISO 3166-2:ID,IDN_2014.0
4,4,3.748371e+17,154226261,0.0,1.0,0.0,2013.0,United Kingdom,,0,1,United Kingdom,GB,GBR,826.0,ISO 3166-2:GB,GBR_2013.0


In [30]:
tweets = tweets.dropna(subset=['country'])

In [31]:
columns_stats = ["tweets_positive_sent", "tweets_political",'likes','retweets']
grouped = tweets.groupby(['country','year'])["tweets_positive_sent", "tweets_political",'likes','retweets'].agg(['sum','count']).reset_index()
grouped.head()

  grouped = tweets.groupby(['country','year'])["tweets_positive_sent", "tweets_political",'likes','retweets'].agg(['sum','count']).reset_index()


Unnamed: 0_level_0,country,year,tweets_positive_sent,tweets_positive_sent,tweets_political,tweets_political,likes,likes,retweets,retweets
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,count,sum,count,sum,count,sum,count
0,Afghanistan,2013.0,2,4,4,4,0.0,4,6.0,4
1,Afghanistan,2014.0,8,18,18,18,10.0,18,15.0,18
2,Afghanistan,2015.0,6,11,11,11,1.0,11,0.0,11
3,Afghanistan,2016.0,17,44,44,44,271.0,44,84.0,44
4,Afghanistan,2017.0,13,54,54,54,266.0,54,89.0,54


In [32]:
country_code = pd.read_csv(os.path.join(path_to_data, country_code_filename))
country_code = country_code.rename(columns = {"English short name lower case": "country"})

In [33]:
country_code

Unnamed: 0,country,Alpha-2 code,Alpha-3 code,Numeric code,ISO 3166-2
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX
2,Albania,AL,ALB,8,ISO 3166-2:AL
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ
4,American Samoa,AS,ASM,16,ISO 3166-2:AS
...,...,...,...,...,...
241,Wallis and Futuna,WF,WLF,876,ISO 3166-2:WF
242,Western Sahara,EH,ESH,732,ISO 3166-2:EH
243,Yemen,YE,YEM,887,ISO 3166-2:YE
244,Zambia,ZM,ZMB,894,ISO 3166-2:ZM


In [34]:
grouped = pd.merge(grouped,country_code[['country','Alpha-3 code']], left_on='country', right_on='country', how = 'left')

  grouped = pd.merge(grouped,country_code[['country','Alpha-3 code']], left_on='country', right_on='country', how = 'left')


In [55]:
grouped_refactored.to_csv("groupedv2_refactored.csv")

In [52]:
grouped_refactored = pd.read_csv('groupedv2_refactored.csv', 
sep = ';',
dtype={'Year':int}
)
grouped_refactored.drop('Unnamed: 0', axis=1, inplace = True)
grouped_refactored.rename(columns = {'Year':'year'}, inplace=True)
grouped_refactored.head()

Unnamed: 0,country,Country,year,sum_pos_tweets,count_tweets,sum_political_tweets,sum_likes,sum_retweeets,Alpha-3 code
0,Afghanistan,Afghanistan,2013,2,4,4,0.0,6.0,AFG
1,Afghanistan,Afghanistan,2014,8,18,18,10.0,15.0,AFG
2,Afghanistan,Afghanistan,2015,6,11,11,1.0,0.0,AFG
3,Afghanistan,Afghanistan,2016,17,44,44,271.0,84.0,AFG
4,Afghanistan,Afghanistan,2017,13,54,54,266.0,89.0,AFG


In [53]:
grouped_refactored['Key'] = grouped_refactored['Alpha-3 code'] + '_' + grouped_refactored['year'].astype(str)

In [54]:
grouped_refactored.head()

Unnamed: 0,country,Country,year,sum_pos_tweets,count_tweets,sum_political_tweets,sum_likes,sum_retweeets,Alpha-3 code,Key
0,Afghanistan,Afghanistan,2013,2,4,4,0.0,6.0,AFG,AFG_2013
1,Afghanistan,Afghanistan,2014,8,18,18,10.0,15.0,AFG,AFG_2014
2,Afghanistan,Afghanistan,2015,6,11,11,1.0,0.0,AFG,AFG_2015
3,Afghanistan,Afghanistan,2016,17,44,44,271.0,84.0,AFG,AFG_2016
4,Afghanistan,Afghanistan,2017,13,54,54,266.0,89.0,AFG,AFG_2017


In [44]:
grouped.columns

Index([                        'country',                   ('country', ''),
                            ('year', ''),   ('tweets_positive_sent', 'sum'),
       ('tweets_positive_sent', 'count'),       ('tweets_political', 'sum'),
           ('tweets_political', 'count'),                  ('likes', 'sum'),
                      ('likes', 'count'),               ('retweets', 'sum'),
                   ('retweets', 'count'),                    'Alpha-3 code'],
      dtype='object')

In [29]:
grouped.rename(columns={"('year', '')":'year'})

Unnamed: 0,country,"(country, )","(year, )","(tweets_positive_sent, sum)","(tweets_positive_sent, count)","(tweets_political, sum)","(tweets_political, count)","(likes, sum)","(likes, count)","(retweets, sum)","(retweets, count)",Alpha-3 code
0,Afghanistan,Afghanistan,2013,2,4,4,4,0,4,6,4,AFG
1,Afghanistan,Afghanistan,2014,8,18,18,18,10,18,15,18,AFG
2,Afghanistan,Afghanistan,2015,6,11,11,11,1,11,0,11,AFG
3,Afghanistan,Afghanistan,2016,142,468,468,468,459,468,194,468,AFG
4,Afghanistan,Afghanistan,2017,1923,4956,4956,4956,6618,4956,6953,4956,AFG
...,...,...,...,...,...,...,...,...,...,...,...,...
1066,Zimbabwe,Zimbabwe,2017,1143,3001,3001,3001,2491,3001,2008,3001,ZWE
1067,Zimbabwe,Zimbabwe,2018,4010,9240,9240,9240,19549,9240,8933,9240,ZWE
1068,Zimbabwe,Zimbabwe,2019,13077,28903,28903,28903,62871,28903,30636,28903,ZWE
1069,Zimbabwe,Zimbabwe,2020,7665,17976,17976,17976,55590,17976,18021,17976,ZWE


In [28]:
grouped.reset_index().columns

Index([                          'index',                         'country',
                         ('country', ''),                      ('year', ''),
         ('tweets_positive_sent', 'sum'), ('tweets_positive_sent', 'count'),
             ('tweets_political', 'sum'),     ('tweets_political', 'count'),
                        ('likes', 'sum'),                ('likes', 'count'),
                     ('retweets', 'sum'),             ('retweets', 'count'),
                          'Alpha-3 code'],
      dtype='object')

In [None]:
columns_stats = ["tweets_positive_sent", "tweets_political"]
junk = junk[columns_stats].agg(['sum','count'])

: 

: 

## Creation of final dataset

In [None]:
# Load Trade data
trade = pd.read_csv(os.path.join(path_to_data, trade_filenae))
trade.head()

In [None]:
# Keep only Chinese exports
trade = trade[trade['country']=='China']

In [None]:
# Load gdp data
gdp1 = pd.read_csv(os.path.join(path_to_data, gdp_filename_1))
gdp1.head()

In [None]:
# Load gdp data
gdp2 = pd.read_csv(os.path.join(path_to_data, gdp_filename_2))
gdp2.head()

In [None]:
# Load pop data
pop = pd.read_csv(os.path.join(path_to_data, population_filename))
pop.head()

In [None]:
# Load Gravity data
gravity = pd.read_stata(os.path.join(path_to_data, gravity_filename))
gravity.head()