# EDA
The aim of this notebook is to figure out:

- How to collect raw tweets (with geolocation, handles, hashtags, mentions)
- Which people live in Argentina (maybe through user locations)
- How to track their movements over time
- How long to track their movements over time

In [2]:
import pandas as pd # for dataframes
#import json
import snscrape.modules.twitter as sntwitter
from _datetime import datetime, timedelta
#from multiprocessing import Pool
#import tqdm # for progress bar
import pytz # for current time
import pandas as pd
#import itertools

In [None]:
# set datetime parameters for scraping tweets

now = datetime.now().replace(tzinfo=pytz.utc)
start_date = (now - timedelta(days=1)).strftime('%Y-%m-%d')
end_date = (now + timedelta(days=1)).strftime('%Y-%m-%d')
date_window = str(start_date) + "-" + str(end_date)
scraper = sntwitter.TwitterSearchScraper

In [None]:
# the data that currently is in Equinor/data/input_data/argentina_daily_tweets.csv is df
# this was done on 27.09.2022 for 1 day, and yielded 300k+ tweets

df = pd.DataFrame(scraper(f'near:Argentina within:10km since:{start_date} until:{end_date}').get_items())
print(df)

In [None]:
# creating a new dataframe so we don't mess up the original
# putting in extra columns with user information

df_coord = df
df_coord['user_username'] =  df_coord['user'].apply(lambda x: x['username'])
df_coord['user_id'] =  df_coord['user'].apply(lambda x: x['id'])
df_coord['user_verified'] =  df_coord['user'].apply(lambda x: x['verified'])
df_coord['user_created'] =  df_coord['user'].apply(lambda x: x['created'])
df_coord['user_location'] =  df_coord['user'].apply(lambda x: x['location'])
df_coord['user_followers_count'] = df_coord['user'].apply(lambda x: x['followersCount'])
df_coord['user_friends_count'] = df_coord['user'].apply(lambda x: x['friendsCount'])
df_coord['user_statuses_count'] = df_coord['user'].apply(lambda x: x['statusesCount'])
df_coord['user_favourites_count'] =  df_coord['user'].apply(lambda x: x['favouritesCount'])
df_coord['user_listed_count'] =  df_coord['user'].apply(lambda x: x['listedCount'])
df_coord['user_media_count'] =  df_coord['user'].apply(lambda x: x['mediaCount'])
df_coord['user_protected'] = df_coord['user'].apply(lambda x: x['protected'])
df_coord['user_label'] =  df_coord['user'].apply(lambda x: x['label'])

# saved this to 
df_coord.to_csv('../../../Equinor/data/input_data/argentina_modified_daily_tweets.csv', index=False)

In [None]:
# exploring the data found
df.user_username.value_counts(dropna=False) # not too many from each, which is good
df.user_location.value_counts(dropna=False) # mostly argentina, with a few more specific locations within argentina; needs filtered
df.user_verified.value_counts(dropna=False) # as expected, most are unverified

In [None]:
# to be done next time for more precision
df2 = pd.DataFrame(scraper(f'geocode:-31.416668,-64.183334,10km since:{start_date} until:{end_date}').get_items())
print(df2)

In [3]:
# reading in dataset from tweeters from 27.09.2022

df_old = pd.read_csv('/Users/yelizavetasalo/Desktop/MT5599/Project/MT5599/Equinor/code/data/input_data/argentina_modified_daily_tweets.csv')

  df_old = pd.read_csv('/Users/yelizavetasalo/Desktop/MT5599/Project/MT5599/Equinor/code/data/input_data/argentina_modified_daily_tweets.csv')


In [4]:
df_old.columns

Index(['url', 'date', 'content', 'renderedContent', 'id', 'user', 'replyCount',
       'retweetCount', 'likeCount', 'quoteCount', 'conversationId', 'lang',
       'source', 'sourceUrl', 'sourceLabel', 'outlinks', 'tcooutlinks',
       'media', 'retweetedTweet', 'quotedTweet', 'inReplyToTweetId',
       'inReplyToUser', 'mentionedUsers', 'coordinates', 'place', 'hashtags',
       'cashtags', 'user_username', 'user_id', 'user_verified', 'user_created',
       'user_location', 'user_followers_count', 'user_friends_count',
       'user_statuses_count', 'user_favourites_count', 'user_listed_count',
       'user_media_count', 'user_protected', 'user_label'],
      dtype='object')

In [70]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 500)

In [104]:
# to standardise the locations a bit, remove the whitespace around countries

df_old["user_location"] = df_old["user_location"].str.strip()

In [105]:
# seeing how many people's user locations were "Argentina" (likely where they live)

pd.set_option('display.max_rows', None)
df_old.user_location.value_counts()

Argentina                                  159570
Honduras                                    27541
San José, Costa Rica                        11009
argentina                                   10831
Heredia, Costa Rica                          3117
ARGENTINA                                    2924
Canadá                                       2599
Alajuela, Costa Rica                         2400
Cartago, Costa Rica                          2368
Puntarenas, Costa Rica                       1077
San Jose, Costa Rica                          974
La Boca                                       557
Argentina🇦🇷                                   465
Paraíso                                       425
Bataan                                        384
Escazu, Costa Rica                            347
Grecia                                        342
luisa                                         310
San Pedro, Costa Rica                         296
Esparza, Costa Rica                           259


In [15]:
# filtering by those whose permanent location is Argentina, saving it as text file 

import numpy as np

argentina_users = df_old[df_old["user_location"].isin(["Argentina", "argentina", "ARGENTINA", "Argentina🇦🇷"])]["user_username"].unique()

with open("../data/argentina_users.txt", "w") as f:
    for handle in argentina_users:
        f.write(f"{handle}\n")

In [55]:
# reading in scraped dataset from 100 of those handles

df_fromusers = pd.read_json("/Users/yelizavetasalo/Desktop/MT5599/Project/MT5599/Project/code/data/argentinian_users_90days_1.json")
df_fromusers

Unnamed: 0,id,user,DateTime,tweet,user_location,place,coordinates
0,1575115736665665536,Unluckyblaaack,2022-09-28 13:30:30+00:00,@Zephyrath @todorokeh @BAYC2745 Good proyect 🔥,Argentina,,
1,1575073674595799040,Unluckyblaaack,2022-09-28 10:43:22+00:00,@DK_Promotes @CensoredBoysNFT @funguysthenft @...,Argentina,,
2,1574935145752035328,Unluckyblaaack,2022-09-28 01:32:54+00:00,@KevinPromotes @wenmyfrens @funguysthenft @Pul...,Argentina,,
3,1574935045462052864,Unluckyblaaack,2022-09-28 01:32:30+00:00,@Castg19 @Senzoril Done,Argentina,,
4,1574934903807827968,Unluckyblaaack,2022-09-28 01:31:56+00:00,@Verseshoe @jasonderulo @funguysthenft,Argentina,,
...,...,...,...,...,...,...,...
711616,1444475017631342592,LolaLo1974,2021-10-03 01:30:55+00:00,@Divaneval79 Lindas!🥰,Argentina,,
711617,1444474918113091584,LolaLo1974,2021-10-03 01:30:32+00:00,@Fotobulla Ohhhhh!👏🏻👏🏻👏🏻👏🏻🥰,Argentina,,
711618,1444474852614852608,LolaLo1974,2021-10-03 01:30:16+00:00,@Jiiiiii 🥰,Argentina,,
711619,1444453620099989504,LolaLo1974,2021-10-03 00:05:54+00:00,@Caviar_Marolio 😂,Argentina,,


In [56]:
# creating dataset of those with geolocation (coordinates and place)

users_with_coordinates = df_fromusers[df_fromusers.place != 'None'].reset_index(drop=True)

In [104]:
# changing format of "place" column so we get more information on each

import re

users_with_coordinates['place_fullname'] = 0
users_with_coordinates['place_name'] = 0
users_with_coordinates['place_type'] = 0
users_with_coordinates['place_country'] = 0
users_with_coordinates['place_countrycode'] = 0

temp_df = users_with_coordinates.place.str.replace("\"", "'")

for i in range(users_with_coordinates.shape[0]):
    temp = re.split("',|='", temp_df[i])
    users_with_coordinates['place_fullname'][i] = temp[1]
    users_with_coordinates['place_name'][i] = temp[3]
    users_with_coordinates['place_type'][i] = temp[5]
    users_with_coordinates['place_country'][i] = temp[7]
    users_with_coordinates['place_countrycode'][i] = temp[9]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_coordinates['place_fullname'][i] = temp[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_coordinates['place_name'][i] = temp[3]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_coordinates['place_type'][i] = temp[5]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_coo

In [105]:
users_with_coordinates.place_countrycode.value_counts()

AR')    1763
MX')    1270
DK')      44
AT')      39
CZ')      21
GB')       8
NL')       8
UY')       4
FR')       2
Name: place_countrycode, dtype: int64

In [106]:
users_with_coordinates.coordinates

0       Coordinates(longitude=-63.39386, latitude=-41....
1       Coordinates(longitude=-63.39386, latitude=-41....
2       Coordinates(longitude=-63.39386, latitude=-41....
3       Coordinates(longitude=-63.39386, latitude=-41....
4       Coordinates(longitude=-63.39386, latitude=-41....
                              ...                        
3154    Coordinates(longitude=-58.53815518510105, lati...
3155    Coordinates(longitude=-58.53815518510105, lati...
3156    Coordinates(longitude=-58.53927033342667, lati...
3157    Coordinates(longitude=-58.53927033342667, lati...
3158    Coordinates(longitude=-58.53927033342667, lati...
Name: coordinates, Length: 3159, dtype: object

In [125]:
# changing format of "place" column so we get more information on each

import re

users_with_coordinates['coordinates_latitude'] = 0
users_with_coordinates['coordinates_longitude'] = 0

#temp_df = users_with_coordinates.place.str.replace("\"", "'")

for i in range(users_with_coordinates.shape[0]):
    temp = re.split("\(|\)|=|,", users_with_coordinates.coordinates[i])
    users_with_coordinates['coordinates_latitude'][i] = float(temp[2])
    users_with_coordinates['coordinates_longitude'][i] = float(temp[4])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_coordinates['coordinates_latitude'][i] = float(temp[2])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_coordinates['coordinates_longitude'][i] = float(temp[4])


In [128]:
users_with_coordinates

Unnamed: 0,id,user,DateTime,tweet,user_location,place,coordinates,place_fullname,place_name,place_type,place_country,place_countrycode,coordinates_latitude,coordinates_longitude
0,1549489968585768960,MateoReyesLara,2022-07-19 20:22:51+00:00,@sebavs_ @soyalbertosamid Como podes amar a es...,Argentina,"Place(fullName='Buenos Aires, Argentina', name...","Coordinates(longitude=-63.39386, latitude=-41....","Buenos Aires, Argentina",Buenos Aires,admin,Argentina,AR'),-63.393860,-41.035009
1,1549467948007530496,MateoReyesLara,2022-07-19 18:55:21+00:00,@Whiskygaltieri4 @Polaco_Maxi @Surdo__ @_matia...,Argentina,"Place(fullName='Buenos Aires, Argentina', name...","Coordinates(longitude=-63.39386, latitude=-41....","Buenos Aires, Argentina",Buenos Aires,admin,Argentina,AR'),-63.393860,-41.035009
2,1548841309640671232,MateoReyesLara,2022-07-18 01:25:18+00:00,River chupame bien la verga,Argentina,"Place(fullName='Buenos Aires, Argentina', name...","Coordinates(longitude=-63.39386, latitude=-41....","Buenos Aires, Argentina",Buenos Aires,admin,Argentina,AR'),-63.393860,-41.035009
3,1548780995549270016,MateoReyesLara,2022-07-17 21:25:38+00:00,Ayer dije “todas putas” al lado de una que me ...,Argentina,"Place(fullName='Buenos Aires, Argentina', name...","Coordinates(longitude=-63.39386, latitude=-41....","Buenos Aires, Argentina",Buenos Aires,admin,Argentina,AR'),-63.393860,-41.035009
4,1548774875279626240,MateoReyesLara,2022-07-17 21:01:19+00:00,@ilnottpott @strokerarolinga @lulasickof17 @lu...,Argentina,"Place(fullName='Buenos Aires, Argentina', name...","Coordinates(longitude=-63.39386, latitude=-41....","Buenos Aires, Argentina",Buenos Aires,admin,Argentina,AR'),-63.393860,-41.035009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3154,1484425194793930752,LolaLo1974,2022-01-21 07:18:40+00:00,@GiseLosinno @LoreRial1 @marito_pares @Augusto...,Argentina,"Place(fullName='Duty Free Shop', name='Duty Fr...","Coordinates(longitude=-58.53815518510105, lati...",Duty Free Shop,Duty Free Shop,poi,Argentina,AR'),-58.538155,-34.814281
3155,1484425133724823552,LolaLo1974,2022-01-21 07:18:25+00:00,@Gui_patagonico Gracias Guille!!,Argentina,"Place(fullName='Duty Free Shop', name='Duty Fr...","Coordinates(longitude=-58.53815518510105, lati...",Duty Free Shop,Duty Free Shop,poi,Argentina,AR'),-58.538155,-34.814281
3156,1484425065064062976,LolaLo1974,2022-01-21 07:18:09+00:00,@marianofrattura Gracias!!!,Argentina,Place(fullName='Aeropuerto Internacional de Ez...,"Coordinates(longitude=-58.53927033342667, lati...",Aeropuerto Internacional de Ezeiza - Ministro ...,Aeropuerto Internacional de Ezeiza - Ministro ...,poi,Argentina,AR'),-58.539270,-34.813526
3157,1484425012954079232,LolaLo1974,2022-01-21 07:17:57+00:00,@luazzem Gracias!!!!,Argentina,Place(fullName='Aeropuerto Internacional de Ez...,"Coordinates(longitude=-58.53927033342667, lati...",Aeropuerto Internacional de Ezeiza - Ministro ...,Aeropuerto Internacional de Ezeiza - Ministro ...,poi,Argentina,AR'),-58.539270,-34.813526


In [129]:
users_with_coordinates.to_csv('../data/cleaned_argentinians_04_10_2022.csv')

In [130]:
users_with_coordinates.coordinates_latitude.value_counts()

-87.777003    1206
-72.314275     868
-58.531792     191
-73.572716     156
-58.502543     142
              ... 
-55.281587       1
-55.242027       1
-58.372169       1
-58.726652       1
 13.692183       1
Name: coordinates_latitude, Length: 72, dtype: int64

In [131]:
users_with_coordinates.coordinates_longitude.value_counts()

 20.356869    1206
-50.357259     868
-34.674453     191
-52.395819     155
-34.705421     142
              ... 
-34.887557       1
-34.901050       1
-34.608477       1
-34.699020       1
 47.528642       1
Name: coordinates_longitude, Length: 73, dtype: int64

In [137]:
pd.set_option('display.max_rows', None)
users_with_coordinates.place_country.value_counts()

Argentina          1763
Mexico             1270
Denmark              44
Austria              39
Czech Republic       21
United Kingdom        8
The Netherlands       8
Uruguay               4
France                2
Name: place_country, dtype: int64

In [138]:
users_with_coordinates.DateTime

0      2022-07-19 20:22:51+00:00
1      2022-07-19 18:55:21+00:00
2      2022-07-18 01:25:18+00:00
3      2022-07-17 21:25:38+00:00
4      2022-07-17 21:01:19+00:00
5      2022-07-17 20:06:33+00:00
6      2022-07-17 20:06:19+00:00
7      2022-07-17 20:01:32+00:00
8      2022-07-17 18:59:39+00:00
9      2022-07-15 23:55:51+00:00
10     2022-07-15 23:54:37+00:00
11     2022-07-15 23:53:09+00:00
12     2022-07-15 04:23:16+00:00
13     2022-07-14 03:42:42+00:00
14     2022-07-14 00:45:16+00:00
15     2022-07-13 19:00:50+00:00
16     2022-07-13 15:40:22+00:00
17     2022-07-13 02:37:06+00:00
18     2022-07-12 21:40:40+00:00
19     2022-07-12 12:23:12+00:00
20     2022-07-11 04:13:37+00:00
21     2022-07-11 01:31:53+00:00
22     2022-07-09 01:52:39+00:00
23     2022-07-07 02:58:06+00:00
24     2022-07-07 02:32:51+00:00
25     2022-07-07 02:29:26+00:00
26     2022-07-07 02:28:53+00:00
27     2022-07-04 04:14:16+00:00
28     2022-07-03 05:22:46+00:00
29     2022-07-01 11:45:20+00:00
30     202

In [147]:
users_with_coordinates[users_with_coordinates.user == "LolaLo1974"]

Unnamed: 0,id,user,DateTime,tweet,user_location,place,coordinates,place_fullname,place_name,place_type,place_country,place_countrycode,coordinates_latitude,coordinates_longitude
639,1553486694984417280,LolaLo1974,2022-07-30 21:04:25+00:00,@cuervo_diego Es bajito igual….,Argentina,"Place(fullName='González Catán, Argentina', na...","Coordinates(longitude=-58.714401, latitude=-34...","González Catán, Argentina",González Catán,city,Argentina,AR'),-58.714401,-34.91004
640,1553486609470963712,LolaLo1974,2022-07-30 21:04:04+00:00,@BUBYTTALT 🥰,Argentina,"Place(fullName='González Catán, Argentina', na...","Coordinates(longitude=-58.714401, latitude=-34...","González Catán, Argentina",González Catán,city,Argentina,AR'),-58.714401,-34.91004
641,1553486576520413184,LolaLo1974,2022-07-30 21:03:56+00:00,@Maximil98392812 Claro al exterior es mucho má...,Argentina,"Place(fullName='González Catán, Argentina', na...","Coordinates(longitude=-58.714401, latitude=-34...","González Catán, Argentina",González Catán,city,Argentina,AR'),-58.714401,-34.91004
642,1553486463047696384,LolaLo1974,2022-07-30 21:03:29+00:00,@Gabriel22561764 Gracias 🥰,Argentina,"Place(fullName='González Catán, Argentina', na...","Coordinates(longitude=-58.714401, latitude=-34...","González Catán, Argentina",González Catán,city,Argentina,AR'),-58.714401,-34.91004
643,1553486438699859968,LolaLo1974,2022-07-30 21:03:24+00:00,@Omar24616699 Igual llego a un poquito más!!!!,Argentina,"Place(fullName='González Catán, Argentina', na...","Coordinates(longitude=-58.714401, latitude=-34...","González Catán, Argentina",González Catán,city,Argentina,AR'),-58.714401,-34.91004
644,1553486356503986176,LolaLo1974,2022-07-30 21:03:04+00:00,@WVelazquezPh 🥰🥰🥰,Argentina,"Place(fullName='González Catán, Argentina', na...","Coordinates(longitude=-58.714401, latitude=-34...","González Catán, Argentina",González Catán,city,Argentina,AR'),-58.714401,-34.91004
645,1553486330042228736,LolaLo1974,2022-07-30 21:02:58+00:00,@HVarelak Si claro!,Argentina,"Place(fullName='González Catán, Argentina', na...","Coordinates(longitude=-58.714401, latitude=-34...","González Catán, Argentina",González Catán,city,Argentina,AR'),-58.714401,-34.91004
646,1553402924897894400,LolaLo1974,2022-07-30 15:31:32+00:00,Volamos a 10500 metros aproximadamente. #Dato,Argentina,"Place(fullName='Santa Cruz, Argentina', name='...","Coordinates(longitude=-73.572716, latitude=-52...","Santa Cruz, Argentina",Santa Cruz,admin,Argentina,AR'),-73.572716,-52.395819
647,1553401925827280896,LolaLo1974,2022-07-30 15:27:34+00:00,@Cristia49246647 Como va!!,Argentina,"Place(fullName='Santa Cruz, Argentina', name='...","Coordinates(longitude=-73.572716, latitude=-52...","Santa Cruz, Argentina",Santa Cruz,admin,Argentina,AR'),-73.572716,-52.395819
648,1553401201533788160,LolaLo1974,2022-07-30 15:24:41+00:00,@barscham @revista_bang 🥰,Argentina,"Place(fullName='Santa Cruz, Argentina', name='...","Coordinates(longitude=-73.572716, latitude=-52...","Santa Cruz, Argentina",Santa Cruz,admin,Argentina,AR'),-73.572716,-52.395819
