# Y\*Int Social Media Analysis
#### Ester Zhao

In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
import plotly.graph_objects as go

In [101]:
# Y*Int message log hosted in the DC5 data-challenge repository
url = 'https://raw.githubusercontent.com/data-challengers/DC5/main/DC5-Data/YInt%20Social%20Media%20Data/YInt.csv'
# Parse the first CSV column as datetimes. `infer_datetime_format` is left out on
# purpose: pandas >= 2.0 deprecates it because inferring one consistent format is
# already the default parsing behavior there.
df = pd.read_csv(url, parse_dates=[0])
df.head()

Unnamed: 0,time,location,account,message
0,2020-04-06 00:00:00,Weston,Opportunities2,"Take advantheeseage of theesehese One, theeser..."
1,2020-04-06 00:00:00,Southton,LazyBCouch,@WatchesThomasBird fork it you're back in he s...
2,2020-04-06 00:02:00,Southton,______3333_____,Two month left bedoreefore you loose this vibe...
3,2020-04-06 00:04:00,Broadview,RasoHorse49,"billeeeeer, i miss ytouou !"
4,2020-04-06 00:07:00,West Parton,CuriousPlateBobbie_Mallon,You obviously need to use rumble! #rumble #toW...


In [102]:
# Create string parser for df
def find_string_in_messages(df, string_to_find):
    """
    Return the rows of ``df`` whose 'message' text contains ``string_to_find``.

    The search term is passed to ``Series.str.contains`` with its defaults, so it is
    matched as a case-sensitive regular expression. Rows with a missing message
    never match (``na=False``).

    :param: df: dataframe with a 'message' column
    :param: string_to_find: string (regex pattern) to look for in each message
    :return: an independent copy of the matching rows, keeping the index labels they had in df
    """
    matches = df['message'].str.contains(string_to_find, na=False)
    # Hand back an explicit copy instead of a slice of `df`: callers add columns to
    # the result (e.g. an hourly bucket), which on a slice raises pandas'
    # SettingWithCopyWarning. Index labels are deliberately left as they are in `df`.
    return df[matches].copy()

In [103]:
# Collect every distinct location label that appears in the Y*Int dataset
neighborhoods = pd.unique(df['location'])
print('St Himark Neighborhoods: ', len(neighborhoods), '\n',neighborhoods)

# Messages whose location is one of the two placeholder labels rather than a
# named neighborhood, and the distinct accounts that sent them
other_neighborhoods = df.loc[
    df['location'].isin(['UNKNOWN', '<Location with-held due to contract>'])
]
non_neighborhood_users = other_neighborhoods['account'].unique()

St Himark Neighborhoods:  21 
 ['Weston' 'Southton' 'Broadview' 'West Parton' 'Old Town'
 'Terrapin Springs' 'Downtown' 'Southwest' 'Scenic Vista' 'East Parton'
 'Cheddarford' 'Palace Hills' 'Safe Town' 'Easton' 'Chapparal' 'Northwest'
 'Oak Willow' 'Pepper Mill' 'Wilson Forest' 'UNKNOWN'
 '<Location with-held due to contract>']


In [104]:
def fill_df_nas(df, time_col, group_col, group_arr):
    """
    Expand a dataframe so that every group has one row for every hour in the observed range.

    An hourly grid is built from the minimum to the maximum value of ``time_col``, crossed
    with every entry of ``group_arr``, and ``df`` is left-merged onto it. Grid cells with no
    matching observation get NaN in all non-key columns.

    Useful for Plotly graphs in mode='lines+markers'
    :param: df: dataframe of interest
    :param: time_col: string name of column that contains time variable (or generally, the x variable)
    :param: group_col: string name of column that contains the groups to plot over different traces
    :param: group_arr: list or numpy array of all unique observations in df['group_col']
    :return: dataframe with time_col and group_col first, followed by the remaining columns of df
    """
    if df.empty:
        # No observations means there is no min/max time to span (date_range would
        # fail on NaT). Return an empty frame with the key columns first, which is
        # the same column order the merge below produces.
        other_cols = [col for col in df.columns if col not in (time_col, group_col)]
        return df.loc[:, [time_col, group_col] + other_cols].copy()

    # Sort by time for graphing
    df = df.sort_values(by=[time_col])
    # Hourly grid between the first and last observed time points. Lowercase 'h'
    # matches the floor('h') used elsewhere in this notebook; the uppercase 'H'
    # alias is deprecated in recent pandas.
    time_range = pd.date_range(df[time_col].min(), df[time_col].max(), freq='h')

    # Cartesian product of time points and groups: each time point is repeated once
    # per group, while the group list is tiled once per time point.
    group_values = pd.Series(np.tile(group_arr, len(time_range)))
    time_values = time_range.repeat(len(group_arr))
    grid = pd.DataFrame({time_col: time_values,
                         group_col: group_values})
    # Left merge keeps every grid row; unmatched rows carry NaN for the data columns
    return pd.merge(grid, df, on=[time_col, group_col], how='left')


In [105]:
# Round times to the hour (to be grouped for plotting)
def create_message_counts(df):
    """
    Count messages per location per hour.

    :param: df: dataframe with a datetime 'time' column and a 'location' column
    :return: dataframe with columns 'location', 'hour' (time floored to the hour) and 'count'
    """
    # assign() puts the 'hour' column on a temporary copy, so the caller's dataframe
    # is not modified. Writing the column into `df` directly both leaked a new
    # column into the caller's data and raised SettingWithCopyWarning whenever `df`
    # was a filtered subset of another frame.
    hourly = df.assign(hour=df['time'].dt.floor('h'))
    # Create count aggregate of messages per (location, hour) bucket
    return hourly.groupby(['location', 'hour']).size().reset_index(name='count')


In [106]:
# Plot frequency of messaging from neighborhoods over time
def plot_messages_over_time(df, title):
    """
    Draw one line per location showing message counts over time, and display the figure.

    :param: df: dataframe with 'location', 'hour' and 'count' columns
    :param: title: string used as the figure title
    """
    fig = go.Figure()
    # One trace per location present in `df` itself. Reading the notebook-level
    # `neighborhoods` array here made the function depend on hidden global state
    # and added empty traces when plotting a subset of locations.
    for location in df['location'].dropna().unique():
        location_rows = df[df['location'] == location]
        fig.add_trace(go.Scatter(
            x=location_rows['hour'],
            y=location_rows['count'],
            name=location,
            # Do not connect the line across missing (NaN) counts
            connectgaps=False,
            mode='lines')
        )

    fig.update_layout(
        title_text=title,
        xaxis_title="Time",
        yaxis_title="Count",
        legend_title="Neighborhood")

    fig.show()

# Build the hourly counts, pad them onto the full hour-by-neighborhood grid,
# then draw the figure
grouped_df = create_message_counts(df=df)
fig_df = fill_df_nas(
    df=grouped_df,
    time_col='hour',
    group_col='location',
    group_arr=neighborhoods,
)
plot_messages_over_time(df=fig_df, title='Y*Int Messages By Neighborhood')

### Rumble bots

In [107]:
# Messages mentioning the rumble app are suspected bot messages: pull them out,
# then collect the accounts and locations they were posted from
rumble_df = find_string_in_messages(df=df, string_to_find="rumble")
rumble_bots = rumble_df['account'].unique()
rumble_neighborhoods = rumble_df['location'].unique()
print(rumble_bots)

['CuriousPlateBobbie_Mallon' 'Grant1953Rapp' 'FastBBanana40'
 'Marisela_KozmaMouse1991' 'Manuel_OrrLion64' 'Wilbur1994B'
 'FuriousJohnsonCouch' 'TallWoodyHouse' 'FearsCowerWhale' 'DarkGCar'
 'BillyBird57' 'CuriousIcecreamFrank' 'LazyAPlate' 'Tonya1989Ross'
 'GCandidTerry' 'HoldsFrank_CasonBear' 'GuapoHelenCouch'
 'Annie2002Leavitt' 'AttentativeAllenLight2002' 'FleetGladwinPlate'
 'ObnoxiousJohnsonPlate70' 'SmartFBowl12' 'RacesCraigCat'
 'CleverCandyHugh_Vernon' 'DarkRBanana' 'RacesAnnabelle_DewittHorse'
 'WatchesMichael_ObrienMouse' 'AttentativeRobertBanana'
 'MarthaGuapoHolland' 'CleverBeverlyCandy' 'FigueroaOctopus1959'
 'HWileySoares' 'CeravoloWhale1992' 'FamousThomasIcecream' 'Daniel1971'
 'Anthony1996Foster' 'Eugenia1955' 'FleetGibsonCar1970' 'IssacMouse1995'
 'WatchesBruce_ParkerBird' 'Catherin_Brown32' 'CandidLBread'
 'FamousOCar1984' 'FearsMatthewDog' 'JonesCat1952' 'LovesJudy_DavesBird'
 'WileyLCandy' 'CraigDog26' 'FamousIlaBanana' 'FastBananaDickerson'
 'Jeffrey_Chynoweth2002

In [108]:
# Plot hourly rumble message counts per neighborhood to check whether the
# messages all come from the same place
rumble_grouped_df = create_message_counts(df=rumble_df)
fig_df = fill_df_nas(
    df=rumble_grouped_df,
    time_col='hour',
    group_col='location',
    group_arr=rumble_neighborhoods,
)
plot_messages_over_time(df=fig_df, title='Y*Int #rumble Messages by Neighborhood')

# Observation: the messages don't look centered on a specific neighborhood



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Other bots

In [109]:
# Create a sum of total messages per user to isolate bots and remove them from the dataset
def get_account_message_counts(df):
    """
    Tally how many rows (messages) each account has, largest tally first.

    :param: df: dataframe with an 'account' column
    :return: dataframe with columns 'account' and 'count', ordered by descending 'count'
    """
    per_account = df.groupby(['account']).size()
    tally = per_account.reset_index(name='count')
    return tally.sort_values(by=['count'], ascending=False)

account_sums = get_account_message_counts(df=df)
# The ten most active accounts are the first bot candidates
potential_bots = account_sums.iloc[:10]['account'].unique()
print('Potential bots: ', potential_bots)

def get_user_csv(df, user):
    """
    Write one account's rows to '<user>.csv' in the working directory and return them.

    :param: df: dataframe with an 'account' column
    :param: user: account name to select; also used as the CSV file's base name
    :return: the rows of df whose 'account' equals user
    """
    user_rows = df.loc[df['account'] == user]
    # index=False keeps the dataframe's row labels out of the file
    user_rows.to_csv(user + '.csv', index=False)
    return user_rows

# Export each high-volume poster's messages to CSV so their content can be inspected
for user in potential_bots:
    get_user_csv(df=df, user=user)


Potential bots:  ['DerekNolan' 'Syndicated4' 'ChloeJohnson' '______3333_____'
 'Syndicated348' 'CantonCoordon2' 'CassieStones' 'Syndicated5'
 'Opportunities2' 'JordanWantsBac0n']


In [110]:
# Many of the "deals" bots use language that coincides with earthquake-related phrases
# Eliminate them from the dataset to get actual earthquake messages

# Find every account that posted at least one message containing 'deals'
deals_df = find_string_in_messages(df, 'deals')
deals_accounts = deals_df['account'].unique()
# Rebind deals_df to ALL messages sent by those accounts (not only the 'deals'
# messages), so the totals below reflect each account's full message volume
boolean_series = df['account'].isin(deals_accounts)
deals_df = df[boolean_series]
# NOTE: this rebinds account_sums, which previously held the totals for every account
account_sums = get_account_message_counts(deals_df)
# print(account_sums)
# Did some spot checks, doesn't look like the last 3 are bots
# Treat only the accounts with more than 50 messages as bots
bots = account_sums.loc[account_sums['count'] > 50] 
print(bots)
# Get list of bot account names
bot_accounts = bots['account'].unique()


# Create a df that doesn't include bots
# NOTE: account becomes the index here so rows can be dropped by account name;
# cleaned_df is therefore indexed by account and has no 'account' column
data_with_index = df.set_index("account")
cleaned_df = data_with_index.drop(bot_accounts)
print(cleaned_df)

             account  count
9        Syndicated4    120
11   ______3333_____    103
8      Syndicated348    102
2     CantonCoordon2     88
10       Syndicated5     78
6     Opportunities2     75
4   JordanWantsBac0n     73
5     Opportunities1     72
3   J0rdanWantsBacon     65
12            handle     58
                                         time          location  \
account                                                           
LazyBCouch                2020-04-06 00:00:00          Southton   
RasoHorse49               2020-04-06 00:04:00         Broadview   
CuriousPlateBobbie_Mallon 2020-04-06 00:07:00       West Parton   
Moore1961                 2020-04-06 00:11:00          Old Town   
AttentativeKHouse         2020-04-06 00:11:00  Terrapin Springs   
...                                       ...               ...   
DerekNolan                2020-04-10 11:59:00       Cheddarford   
WileyPlateHunter          2020-04-10 11:59:00         Safe Town   
LazyCBowl             

In [111]:
# Group all these and then figure out what's going on
# 'I am annoyed by my [adjective] neighbor' (some typos!), #FTWneighbor
# 'I hate all the [adjective] [adjective]s lying around...'

### Earthquake Activity

In [112]:
# 'quaking', 'aftershock', 'quake', 'earthquake', 'rumble'
# topic grouping

In [None]:
# NEXT STEPS:
# create a frequency counter for topics/phrases to find most popular phrases? how to do this
# create a graphic to show frequency of topic over time
# goal to get a nice timeline of events (earthquakes, bridges down, ferrets?!)