### Load options, Groupby, Lambdas and Functions

This notebook gives you more sophisticated methods of maniputaling your data using lambda functions and more customized functions as well as a way to create more sophisticated ways to summarize your data. 

In [1]:
import pandas as pd

**NEW:** 
Every function comes with various options you can specify. Below we are looking at assigning data types you can assign to each column

In [2]:
tweets = pd.read_csv('../data/iranian_tweets_csv_hashed.csv', 
                     dtype = {'tweetid': 'str','retweet_tweetid':'str'}, # <-- here you can specify data as strings, floats or integers,
                     parse_dates = ['account_creation_date', 'tweet_time'] # <-- this line makes pandas interpret these columns as dates   
)

In [3]:
len(tweets)

1122936

In [4]:
tweets.dtypes

tweetid                             object
userid                              object
user_display_name                   object
user_screen_name                    object
user_reported_location              object
user_profile_description            object
user_profile_url                    object
follower_count                       int64
following_count                      int64
account_creation_date       datetime64[ns]
account_language                    object
tweet_language                      object
tweet_text                          object
tweet_time                  datetime64[ns]
tweet_client_name                   object
in_reply_to_tweetid                float64
in_reply_to_userid                  object
quoted_tweet_tweetid               float64
is_retweet                            bool
retweet_userid                      object
retweet_tweetid                     object
latitude                           float64
longitude                          float64
quote_count

In [5]:
tweets.head().T

Unnamed: 0,0,1,2,3,4
tweetid,533622371429543936,527205814906654721,545166827350134784,538045437316321280,530053681668841472
userid,299148448,299148448,299148448,299148448,299148448
user_display_name,Maria Luis,Maria Luis,Maria Luis,Maria Luis,Maria Luis
user_screen_name,marialuis91,marialuis91,marialuis91,marialuis91,marialuis91
user_reported_location,"Nantes, France","Nantes, France","Nantes, France","Nantes, France","Nantes, France"
user_profile_description,journaliste indépendante/un vrai journaliste e...,journaliste indépendante/un vrai journaliste e...,journaliste indépendante/un vrai journaliste e...,journaliste indépendante/un vrai journaliste e...,journaliste indépendante/un vrai journaliste e...
user_profile_url,,,,,
follower_count,8012,8012,8012,8012,8012
following_count,1450,1450,1450,1450,1450
account_creation_date,2011-05-15 00:00:00,2011-05-15 00:00:00,2011-05-15 00:00:00,2011-05-15 00:00:00,2011-05-15 00:00:00


In [6]:

tweets_2016_2018 = tweets[
    tweets['tweet_time'].dt.year > 2015
]

In [7]:
print(len(tweets_2016_2018))
tweets_2016_2018.head()

624685


Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,user_profile_url,follower_count,following_count,account_creation_date,...,latitude,longitude,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,poll_choices
24,1019648905879998465,299148448,Maria Luis,marialuis91,"Nantes, France",journaliste indépendante/un vrai journaliste e...,,8012,1450,2011-05-15,...,,,0.0,0.0,0.0,0.0,[],[],,
27,767405558848745472,299148448,Maria Luis,marialuis91,"Nantes, France",journaliste indépendante/un vrai journaliste e...,,8012,1450,2011-05-15,...,,,0.0,0.0,0.0,0.0,[],[http://fr.whatsupic.com/nouvelles-politiques-...,"[748935305630195712, 331946671, 3091521407]",
28,733203427304562688,299148448,Maria Luis,marialuis91,"Nantes, France",journaliste indépendante/un vrai journaliste e...,,8012,1450,2011-05-15,...,,,0.0,0.0,0.0,0.0,[],[http://fr.awdnews.com/soci%C3%A9t%C3%A9/l-%C3...,[504729181],
34,822409242376372225,299148448,Maria Luis,marialuis91,"Nantes, France",journaliste indépendante/un vrai journaliste e...,,8012,1450,2011-05-15,...,,,0.0,0.0,0.0,0.0,[],[http://whatsupic.com/news-politics-world/1476...,[935688830],
127,822409682568564736,299148448,Maria Luis,marialuis91,"Nantes, France",journaliste indépendante/un vrai journaliste e...,,8012,1450,2011-05-15,...,,,0.0,0.0,0.0,0.0,[],[http://whatsupic.com/news-politics-world/1476...,[2889322829],


In [8]:
grouped_tweets = tweets_2016_2018.groupby(['account_language','user_reported_location'])['tweetid'].count()

In [9]:
grouped_tweets

account_language  user_reported_location                               
ar                Amman                                                        9
                  Arab World                                                  14
                  Bahrain                                                      7
                  Kingdom of Saudi Arabia                                  10390
                  Libya                                                     4170
                  London                                                     156
                  Nederland                                                   17
                  Riyadh                                                    1133
                  Syrian Arab Republic                                      9428
                  The Netherlands                                            907
                  Yemen                                                     2415
                  jordan             

In [10]:
grouped_tweets.reset_index()

Unnamed: 0,account_language,user_reported_location,tweetid
0,ar,Amman,9
1,ar,Arab World,14
2,ar,Bahrain,7
3,ar,Kingdom of Saudi Arabia,10390
4,ar,Libya,4170
5,ar,London,156
6,ar,Nederland,17
7,ar,Riyadh,1133
8,ar,Syrian Arab Republic,9428
9,ar,The Netherlands,907


In [11]:
grouped_tweets.reset_index().sort_values(by = 'tweetid', ascending=False)

Unnamed: 0,account_language,user_reported_location,tweetid
72,en,Kingdom of Saudi Arabia,46600
136,en,"İstanbul, Türkiye 🇹🇷",38176
117,en,United States,14591
82,en,"Nantes, France",13694
77,en,Moscow,12994
180,en,فلسطين,12759
100,en,Riyadh,12194
116,en,United Kingdom,11624
3,ar,Kingdom of Saudi Arabia,10390
86,en,"New York, USA",9923


### Lambdas
Lambdas are functions you can use to manipulate your columns. Think of them as mini-functions. You use them in conjunction with the `.apply()` function.

In [12]:
num_tweets_per_country_language = grouped_tweets.reset_index().sort_values(by = 'tweetid', ascending=False)

In [13]:
num_tweets_per_country_language['tweetid'].apply(lambda x: x/len(tweets_2016_2018) *100)

72     7.459760
136    6.111240
117    2.335737
82     2.192145
77     2.080088
180    2.042469
100    1.952024
116    1.860778
3      1.663238
86     1.588481
185    1.530691
8      1.509241
176    1.406149
214    1.351881
113    1.303697
69     1.227659
213    1.207489
46     1.095432
31     1.036522
60     0.995382
205    0.878683
109    0.866677
66     0.741494
196    0.716521
61     0.696191
210    0.687707
48     0.668977
4      0.667536
172    0.622874
108    0.601583
         ...   
13     0.003522
146    0.003202
80     0.003202
175    0.002881
6      0.002721
204    0.002721
24     0.002561
107    0.002561
110    0.002401
1      0.002241
91     0.002081
193    0.001761
0      0.001441
151    0.001441
32     0.001281
102    0.001121
2      0.001121
104    0.001121
57     0.000800
88     0.000800
215    0.000800
14     0.000640
90     0.000480
97     0.000480
67     0.000480
207    0.000480
89     0.000480
33     0.000160
38     0.000160
208    0.000160
Name: tweetid, Length: 2

In [14]:
num_tweets_per_country_language['percent_of_all_tweets'] = num_tweets_per_country_language['tweetid'].apply(lambda x: (x/len(tweets_2016_2018)) *100)

In [15]:
num_tweets_per_country_language.head()

Unnamed: 0,account_language,user_reported_location,tweetid,percent_of_all_tweets
72,en,Kingdom of Saudi Arabia,46600,7.45976
136,en,"İstanbul, Türkiye 🇹🇷",38176,6.11124
117,en,United States,14591,2.335737
82,en,"Nantes, France",13694,2.192145
77,en,Moscow,12994,2.080088


Here's how you do the same thing with a function:

In [16]:
def calculate_pct(x):
    return (x/len(tweets_2016_2018)) * 100

In [17]:
num_tweets_per_country_language['percent_of_all_tweets2'] = num_tweets_per_country_language['tweetid'].apply(calculate_pct)

In [18]:
num_tweets_per_country_language.head()

Unnamed: 0,account_language,user_reported_location,tweetid,percent_of_all_tweets,percent_of_all_tweets2
72,en,Kingdom of Saudi Arabia,46600,7.45976,7.45976
136,en,"İstanbul, Türkiye 🇹🇷",38176,6.11124,6.11124
117,en,United States,14591,2.335737,2.335737
82,en,"Nantes, France",13694,2.192145,2.192145
77,en,Moscow,12994,2.080088,2.080088
