# Stalking Users

In [2]:
import pandas as pd
import tqdm

# combining all into one dataframe for analysis

dfs = []

for i in tqdm.tqdm(range(0, 112)):
    df = pd.read_feather("s3://mt5599/final/processed_tweets_" + str(i) + ".feather")

    dfs.append(df)

100%|██████████| 112/112 [24:03<00:00, 12.89s/it]


In [3]:
df = pd.concat(dfs, axis=0, ignore_index=True)
df = df.drop_duplicates("id").reset_index()
df.to_feather("s3://mt5599/dissertation/stalking_users.feather")
df.shape[0]

182571272

In [12]:
df.columns

Index(['index', 'id', 'DateTime', 'coordinates', 'place', 'username',
       'user_id', 'user_location', 'tweet_content', 'lang', 'tweet_clean',
       'place_full_name', 'place_name', 'place_type', 'place_country',
       'place_country_code', 'coordinates_longitude', 'coordinates_latitude',
       'ner_type', 'ner_word', 'gmaps_lat', 'gmaps_long', 'gmaps_address',
       'distance'],
      dtype='object')

In [13]:
df.lang.value_counts(dropna=False)

es     158831071
und      4950029
pt       4898848
en       4268288
zxx      1251938
         ...    
te             2
or             2
km             2
my             2
ug             1
Name: lang, Length: 72, dtype: int64

In [8]:
print("There were a total of", df.shape[0], "tweets from ", len(df.username.unique()),
      "users whose tweets were collected and processed.",
     df[pd.notnull(df.place_full_name)].shape[0], " of those tweets were geotagged, ",
    df[df.ner_type == "LOC"].shape[0], "had a location mention, ",
     df[pd.notnull(df.gmaps_lat)].shape[0], "had a geoparsed location.")

There were a total of 182571272 tweets from  89369 users whose tweets were collected and processed. 56435136  of those tweets were geotagged,  5825772 had a location mention,  4391768 had a geoparsed location.


In [9]:
summary_table = df.groupby(['username'])["id"].count().reset_index(name='counts')
summary_table_geotagged = df[pd.notnull(df.place_full_name)].groupby(['username'])["id"].count().reset_index(name='counts')

In [10]:
import numpy as np

print("Over the relevant time period, Table  shows the summary statistics of tweets per user.")
print()
print("Mean & ", np.mean(summary_table.counts), " & ", np.mean(summary_table_geotagged.counts))
print("Median & ", np.median(summary_table.counts), " & ", np.median(summary_table_geotagged.counts))
print("Max & ", np.max(summary_table.counts), " & ", np.max(summary_table_geotagged.counts))
print("Min & ", np.min(summary_table.counts), " & ", np.min(summary_table_geotagged.counts))
print("Std & ", np.std(summary_table.counts), " & ", np.std(summary_table_geotagged.counts))

Over the relevant time period, Table  shows the summary statistics of tweets per user.

Mean &  2042.8926361490003  &  631.7672424408647
Median &  1164.0  &  286.0
Max &  155484  &  155464
Min &  1  &  1
Std &  2822.51317317166  &  1208.6650806269374


In [11]:
import pandas as pd

# users that tweeted from Argentina
argentina = df[df.place_country_code == "AR"].groupby(['username'])["id"].count().reset_index(name='counts_1')

all_ = df.groupby(['username'])["id"].count().reset_index(name='counts_2')

only_argentina = pd.merge(argentina, all_, how="left", on="username")
only_argentina["only_argentina"] = np.where(only_argentina["counts_1"] == only_argentina["counts_2"], "only AR", "elsewhere")
only_argentina = only_argentina[only_argentina.only_argentina == "only AR"]

print("Shape of df: ", only_argentina.shape)
print()
print("The number of users that tweeted only from Argentina was ", only_argentina.shape[0],
      ", which is ", only_argentina.shape[0] / len(all_.username.unique()) * 100,"% of the total users.",
     "This is interesting as they could have still travelled within Argentina.")

Shape of df:  (188, 4)

The number of users that tweeted only from Argentina was  188 , which is  0.21036377267285075 % of the total users. This is interesting as they could have still travelled within Argentina.


In [5]:
"""

import numpy as np
import tqdm

dates = pd.to_datetime(pd.date_range(start='2015-09-27', end='2016-11-06', freq='W', tz="UTC"))

dates_arr = []
for i in range(len(dates)-1):
    dates_arr.append(dates[i:i+2]) 
    
stay_leave = pd.DataFrame({"username": df.username.unique()})

for username in tqdm.tqdm(range(len(stay_leave))):
    
    tweeted_all_months = [None]*len(dates_arr)
    geotagged_all_months = [None]*len(dates_arr)
    
    average_tweets_per_month = [None]*len(dates_arr)
    average_geotagged_per_month = [None]*len(dates_arr)
    
    for i in range(len(dates_arr)):
        
        date = dates_arr[i]
        start_date = date[0]
        end_date = date[1]
        
        mask = (df.username == username) & (df.DateTime > start_date) & (df.DateTime <= end_date)
        
        tweets_this_month = df[mask]
        geotagged_this_month = df[mask & (pd.notnull(df.place_full_name))]
        
        tweeted_all_months[i] = tweets_this_month.shape[0] > 0
        geotagged_all_months[i] = geotagged_this_month.shape[0] > 0
        
        average_tweets_per_month[i] = tweets_this_month.shape[0]
        average_geotagged_per_month[i] = geotagged_this_month.shape[0]
        
    # people who tweeted at least once a month
    stay_leave.loc[stay_leave.username == username, "tweeted_all_weeks"] = tweeted_all_months == True
    stay_leave.loc[stay_leave.username == username, "geotagged_all_weeks"] = geotagged_all_months == True
    
    # number of average tweets per month per user
    stay_leave.loc[stay_leave.username == username, "mean_tweets_per_week"] = np.mean(average_tweets_per_month)
    stay_leave.loc[stay_leave.username == username, "mean_geotagged_per_week"] = np.mean(average_geotagged_per_month)
    
    # number of median tweets per month per user
    #stay_leave.loc[stay_leave.username == username, "median_tweets_per_week"] = np.median(average_tweets_per_month)
    #stay_leave.loc[stay_leave.username == username, "median_geotagged_per_week"] = np.median(average_geotagged_per_month)
""" 

'\n\nimport numpy as np\nimport tqdm\n\ndates = pd.to_datetime(pd.date_range(start=\'2015-09-27\', end=\'2016-11-06\', freq=\'W\', tz="UTC"))\n\ndates_arr = []\nfor i in range(len(dates)-1):\n    dates_arr.append(dates[i:i+2]) \n    \nstay_leave = pd.DataFrame({"username": df.username.unique()})\n\nfor username in tqdm.tqdm(range(len(stay_leave))):\n    \n    tweeted_all_months = [None]*len(dates_arr)\n    geotagged_all_months = [None]*len(dates_arr)\n    \n    average_tweets_per_month = [None]*len(dates_arr)\n    average_geotagged_per_month = [None]*len(dates_arr)\n    \n    for i in range(len(dates_arr)):\n        \n        date = dates_arr[i]\n        start_date = date[0]\n        end_date = date[1]\n        \n        mask = (df.username == username) & (df.DateTime > start_date) & (df.DateTime <= end_date)\n        \n        tweets_this_month = df[mask]\n        geotagged_this_month = df[mask & (pd.notnull(df.place_full_name))]\n        \n        tweeted_all_months[i] = tweets_this_

In [None]:
# number of people who tweeted at least once a week
stay_leave.tweeted_all_weeks.value_counts(dropna=False)

In [None]:
stay_leave.geotagged_all_weeks.value_counts(dropna=False)

In [None]:
# number of average tweets per week per user
#stay_leave.mean_tweets_per_week.value_counts(dropna=False)
print("Table ___ shows the mean and median numbers of total tweets and geotagged tweets per user per week.")
print()
print("& Total & Geotagged \\")
print("Mean & ", np.mean(stay_leave.mean_tweets_per_week), " & ", np.mean(stay_leave.mean_geotagged_per_week), " \\")
print("Median & ", np.median(stay_leave.mean_tweets_per_week), " & ", np.median(stay_leave.mean_geotagged_per_week), " \\")