# Stalking Users

In [2]:
import pandas as pd
import tqdm

# Read every processed tweet shard (files 0..111) from S3. The shards are
# collected in a list here and concatenated into one dataframe afterwards.
dfs = [
    pd.read_feather(f"s3://mt5599/final/processed_tweets_{shard}.feather")
    for shard in tqdm.tqdm(range(112))
]

100%|██████████| 112/112 [24:15<00:00, 13.00s/it]


In [None]:
# Stack all shards, keep the first row encountered for each tweet id, and
# persist the combined frame.
# NOTE(review): reset_index() is called without drop=True, so the pre-dedupe
# row positions are kept as a new "index" column, which is also written to the
# feather file. Confirm that this column is wanted before changing it.
df = (
    pd.concat(dfs, axis=0, ignore_index=True)
    .drop_duplicates("id")
    .reset_index()
)
df.to_feather("s3://mt5599/dissertation/stalking_users.feather")

# Number of unique tweets.
len(df)

In [None]:
# Tweets that mention a location (rows whose ner_type is "LOC").
# The filter runs before de-duplication, so for a tweet id that has several
# rows it is the first LOC row that is kept.
df0 = (
    pd.concat(dfs, axis=0, ignore_index=True)
    .loc[lambda rows: rows["ner_type"] == "LOC"]
    .drop_duplicates("id")
    .reset_index()
)
df0

In [None]:
# Inspect which columns the combined, de-duplicated tweet dataframe provides.
df.columns

In [None]:
# Tweets per language code; missing language values get their own row.
df["lang"].value_counts(dropna=False)

In [None]:
# Headline counts for the write-up, computed up front so the print call only
# assembles the sentence.
n_tweets = len(df)
n_users = df.username.nunique(dropna=False)
n_geotagged = int(df.place_full_name.notna().sum())
n_location_mentions = len(df0)
# NOTE(review): "geoparsed" is defined here as a non-null gmaps_lat, while the
# per-user summary table in this notebook uses gmaps_address. Confirm that the
# two columns are always populated together.
n_geoparsed = int(df0.gmaps_lat.notna().sum())

print("There were a total of", n_tweets, "tweets from ", n_users,
      "users whose tweets were collected and processed.",
      n_geotagged, " of those tweets were geotagged, ",
      n_location_mentions, "had a location mention, ",
      n_geoparsed, "had a geoparsed location.")

In [None]:
def _tweets_per_user(tweets, count_column):
    """Return one row per username with the number of non-null ids in `tweets`.

    The count is stored in a column named `count_column`. Users with no rows
    in `tweets` do not appear in the result.
    """
    return tweets.groupby(['username'])["id"].count().reset_index(name=count_column)


# Per-user tweet counts: all tweets, geotagged tweets, tweets mentioning a
# location, and location mentions that were geoparsed to an address.
summary_table = _tweets_per_user(df, 'total_tweets')
summary_table_geotagged = _tweets_per_user(df[df.place_full_name.notna()], 'geotagged_tweets')
summary_table_mentioned = _tweets_per_user(df0, "tweets_with_LOC")
summary_table_geoparsed = _tweets_per_user(df0[df0.gmaps_address.notna()], "geoparsed_tweets")

In [None]:
# One row per user with all four counts. The left joins keep every user from
# summary_table, and users missing from a count table get 0 for that count.
s = (
    summary_table
    .merge(summary_table_geotagged, how="left", on="username")
    .merge(summary_table_mentioned, how="left", on="username")
    .merge(summary_table_geoparsed, how="left", on="username")
    .fillna(0)
)
s

In [None]:
import numpy as np

print("Over the relevant time period, Table  shows the summary statistics of tweets per user.")
print()

# All four columns are taken from the merged per-user table `s`, in which users
# without any geotagged / location-mention / geoparsed tweets are present with
# a count of 0. Computing the statistics from the separate summary_table_*
# frames would silently drop those users, so each column would describe a
# different set of users (and the minimum could never be 0), and the table
# would disagree with the distribution plot, which is also drawn from `s`.
per_user_counts = s[["total_tweets", "geotagged_tweets", "tweets_with_LOC", "geoparsed_tweets"]]

# Row label and the statistic it reports. np.std is the population standard
# deviation (ddof=0).
summary_statistics = [
    ("Mean & ", np.mean),
    ("Median & ", np.median),
    ("Max & ", np.max),
    ("Min & ", np.min),
    ("Std & ", np.std),
]

for row_label, statistic in summary_statistics:
    total, geotagged, mentioned, geoparsed = (
        statistic(per_user_counts[column]) for column in per_user_counts.columns
    )
    print(row_label, total,
          " & ", geotagged,
          " & ", mentioned,
          " & ", geoparsed)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Default figure size for the seaborn/matplotlib figures created below.
sns.set(rc={'figure.figsize':(10,10)})

# Four vertically stacked panels, one per tweet count in the per-user table `s`.
fig, axes = plt.subplots(4, 1)
fig.suptitle('Distribution of Tweets')

# Kernel density estimate of each per-user count, top to bottom.
panel_columns = ["total_tweets", "geotagged_tweets", "tweets_with_LOC", "geoparsed_tweets"]
for panel, column in zip(axes, panel_columns):
    sns.kdeplot(data = s[column], ax = panel)

# Shared axis captions placed on the figure rather than on each panel.
# NOTE(review): the panels are density estimates, but the shared y caption
# reads 'Frequency'. Confirm the intended wording.
fig.text(0.5, 0.01, 'Number of Tweets', ha='center')
fig.text(0.01, 0.5, 'Frequency', va='center', rotation='vertical')

# Save before show() so the file is written from the fully drawn figure.
plt.savefig('user_tweets.png')
plt.show()

In [None]:
import pandas as pd

# Users whose located tweets were all posted from Argentina.
#
# Only tweets that carry a country code say anything about where the author
# was, so each user's Argentina count is compared with their number of tweets
# that have *any* country code. Comparing with all of a user's tweets would
# classify everyone who has even one non-geotagged tweet as "elsewhere".

# tweets per user with country code "AR"
argentina = df[df.place_country_code == "AR"].groupby(['username'])["id"].count().reset_index(name='counts_1')

# tweets per user with any country code
located = df[df.place_country_code.notna()].groupby(['username'])["id"].count().reset_index(name='counts_2')

# every user in the dataset; used as the denominator of the percentage
all_ = df.groupby(['username'])["id"].count().reset_index(name='counts_2')

only_argentina = pd.merge(argentina, located, how="left", on="username")
only_argentina["only_argentina"] = np.where(only_argentina["counts_1"] == only_argentina["counts_2"], "only AR", "elsewhere")
only_argentina = only_argentina[only_argentina.only_argentina == "only AR"]

print("Shape of df: ", only_argentina.shape)
print()
print("The number of users that tweeted only from Argentina was ", only_argentina.shape[0],
      ", which is ", only_argentina.shape[0] / len(all_.username.unique()) * 100,"% of the total users.",
     "This is interesting as they could have still travelled within Argentina.")

In [None]:
# Weekly activity per user.
#
# This cell builds `stay_leave`, which the following cells read, so it must run
# for the notebook to work from a fresh kernel. It makes one pass over `df` per
# week and counts all users at once with value_counts. Compared with a
# per-user, per-week loop this also avoids two pitfalls: looping over integer
# positions and comparing them with usernames (which matches nothing), and
# testing `list == True` (which is always False) instead of checking that
# every week is active.
import numpy as np
import tqdm

# Consecutive weekly boundaries over the study period, as (start, end] windows.
dates = pd.date_range(start='2015-09-27', end='2016-11-06', freq='W', tz="UTC")
dates_arr = list(zip(dates[:-1], dates[1:]))

usernames = df.username.unique()
is_geotagged = df.place_full_name.notna()

# Count matrices: one row per user, one column per week.
tweets_per_week = np.zeros((len(usernames), len(dates_arr)), dtype=int)
geotagged_per_week = np.zeros_like(tweets_per_week)

for week, (start_date, end_date) in enumerate(tqdm.tqdm(dates_arr)):
    in_week = (df.DateTime > start_date) & (df.DateTime <= end_date)

    # reindex() lines the counts up with `usernames` and gives 0 to users who
    # did not tweet in this window.
    tweets_per_week[:, week] = (
        df.username[in_week].value_counts().reindex(usernames, fill_value=0).to_numpy()
    )
    geotagged_per_week[:, week] = (
        df.username[in_week & is_geotagged].value_counts().reindex(usernames, fill_value=0).to_numpy()
    )

stay_leave = pd.DataFrame({
    "username": usernames,
    # True only for users with at least one (geotagged) tweet in every week
    "tweeted_all_weeks": (tweets_per_week > 0).all(axis=1),
    "geotagged_all_weeks": (geotagged_per_week > 0).all(axis=1),
    # average number of (geotagged) tweets per week for each user
    "mean_tweets_per_week": tweets_per_week.mean(axis=1),
    "mean_geotagged_per_week": geotagged_per_week.mean(axis=1),
})

In [None]:
# How many users did / did not tweet at least once in every week.
stay_leave["tweeted_all_weeks"].value_counts(dropna=False)

In [None]:
# How many users did / did not post a geotagged tweet in every week.
stay_leave["geotagged_all_weeks"].value_counts(dropna=False)

In [None]:
# Mean and median, across users, of each user's average weekly number of
# tweets and geotagged tweets, printed as LaTeX table rows.
# The row terminator is written as a raw string: a LaTeX row must end with two
# backslashes, and the non-raw literal "\\" prints only one.
print("Table ___ shows the mean and median numbers of total tweets and geotagged tweets per user per week.")
print()
print(r"& Total & Geotagged \\")
print("Mean & ", np.mean(stay_leave.mean_tweets_per_week), " & ", np.mean(stay_leave.mean_geotagged_per_week), r" \\")
print("Median & ", np.median(stay_leave.mean_tweets_per_week), " & ", np.median(stay_leave.mean_geotagged_per_week), r" \\")