In [4]:
import pandas as pd
import plotly.express as px
import scipy.stats as ss
import matplotlib.pyplot as plt
import numpy as np
import imgkit

tweetsdf = pd.read_csv('/Users/leemingi/Downloads/finalScores.csv').dropna(subset=['score'])
tweetsdf = tweetsdf.iloc[:,2:]
tweetsdf.dropna(subset=['hashtags'])
# Converting date field to datetime object
tweetsdf['date'] = pd.to_datetime(tweetsdf['date'])

In [45]:
# Dates/Starts of positive events
negativeEventDates = {
    'flood': pd.Timestamp('2009-09-18'),
    'winter-storm': pd.Timestamp('2009-12-19'),
    'appomattrox': pd.Timestamp('2010-01-19'),
    'fort-hood': pd.Timestamp('2009-11-05')
}

def plotAvgHappiness(event, keywords, eventName, offset:int = 14, tweetsdf:pd.DataFrame =  tweetsdf, negativeEventDates:dict = negativeEventDates):
    dateOffset = pd.DateOffset(days=offset)
    # Filtering for tweets that are within two weeks of event
    filteredTweets = tweetsdf[(tweetsdf['date'] >= (negativeEventDates[event] - dateOffset)) & (tweetsdf['date'] <= (negativeEventDates[event] + dateOffset))]
    newDF = filteredTweets.copy()
    newDF.loc[:,'containsKeywords'] = newDF['original_tweet'].str.contains('|'.join(keywords) if len(keywords) > 1 else keywords[0])
    newDF.loc[:, 'all'] = 1  # Add a constant column

    aggTweets = newDF.groupby(['containsKeywords', pd.Grouper(key='date', freq='D')]).mean(numeric_only=True)
    aggTweets = aggTweets.reset_index().set_index('date')

    allTweets = newDF.groupby(pd.Grouper(key='date', freq='D')).mean(numeric_only=True)
    allTweets['containsKeywords'] = 'All'  # Add a label for the new line

    # Combine the DataFrames and sort by date
    aggTweets = pd.concat([aggTweets, allTweets]).sort_index()

    # Plotting distribution of tweet scores around date
    plot = px.line(
        data_frame=aggTweets,
        x=aggTweets.index,
        y=aggTweets.score,
        color=aggTweets['containsKeywords'],
        title=f'Average Happiness Score for {eventName}',
        labels = {'date':'Date','score':'Avg. Happiness Score', 'containsKeywords': 'Contains Keywords'}
    )
    plot.add_vline(x=negativeEventDates[event])


    return plot

def plotAvgHappinessGeo(event, keywords, eventName, location, offset:int = 14, tweetsdf:pd.DataFrame =  tweetsdf, negativeEventDates:dict = negativeEventDates):
    dateOffset = pd.DateOffset(days=offset)
    # Filtering for tweets that are within two weeks of event
    filteredTweets = tweetsdf[(tweetsdf['date'] >= (negativeEventDates[event] - dateOffset)) & (tweetsdf['date'] <= (negativeEventDates[event] + dateOffset))]
    filteredTweets = filteredTweets[(filteredTweets['latitude'] >= location[0])
                            & (filteredTweets['latitude'] <= location[1])
                            & (filteredTweets['longitude'] <= location[2])
                            & (filteredTweets['longitude'] >= location[3])]

    newDF = filteredTweets.copy()
    newDF.loc[:,'containsKeywords'] = newDF['original_tweet'].str.contains('|'.join(keywords) if len(keywords) > 1 else keywords[0])
    newDF.loc[:, 'all'] = 1  # Add a constant column

    aggTweets = newDF.groupby(['containsKeywords', pd.Grouper(key='date', freq='D')]).mean(numeric_only=True)
    aggTweets = aggTweets.reset_index().set_index('date')

    allTweets = newDF.groupby(pd.Grouper(key='date', freq='D')).mean(numeric_only=True)
    allTweets['containsKeywords'] = 'All'  # Add a label for the new line

    # Combine the DataFrames and sort by date
    aggTweets = pd.concat([aggTweets, allTweets]).sort_index()

    # Plotting distribution of tweet scores around date
    plot = px.line(
        data_frame=aggTweets,
        x=aggTweets.index,
        y=aggTweets.score,
        color=aggTweets['containsKeywords'],
        title=f'Average Happiness Score for {eventName} in {location[4]}',
        labels = {'date':'Date','score':'Avg. Happiness Score', 'containsKeywords': 'Contains Keywords'}
    )
    plot.add_vline(x=negativeEventDates[event])


    return plot



def performTtest(event, offset:int = 14, tweetsdf:pd.DataFrame =  tweetsdf, negativeEventDates:dict = negativeEventDates):
    dateOffset = pd.DateOffset(days=offset)

    # Group 1 = pre
    # Group 2 = post
    group1 = tweetsdf[(tweetsdf['date'] >= (negativeEventDates[event] - dateOffset)) & (tweetsdf['date'] < negativeEventDates[event])]['score']
    group2 = tweetsdf[(tweetsdf['date'] <= (negativeEventDates[event] + dateOffset)) & (tweetsdf['date'] > negativeEventDates[event])]['score']

    # Check for normality assumption using Shapiro-Wilk test
    stat1, p1 = ss.shapiro(group1)
    stat2, p2 = ss.shapiro(group2)
    alpha = 0.05

    if p1 > alpha and p2 > alpha:
        print('Both samples are normally distributed.')
    else:
        print('At least one sample is not normally distributed. However, sample size is large enough to ignore.')

    # Check for equal variance assumption using Levene's test
    stat, p = ss.levene(group1, group2)
    if p > alpha:
        var = True
        print('Variances are equal.')
    else:
        var = False
        print('Variances are not equal.')

    # Perform t-test or welsch t-test assuming 
    t, p_final = ss.ttest_ind(group1, group2, equal_var=False)
    if p > alpha:
        print(f'There is no significant difference between the groups. p = {p_final}. var = {var}')
    else:
        print(f'There is a significant difference between the groups. p = {p_final}. var = {var}')

    return (p_final, (group1.mean(), len(group1)), (group2.mean(), len(group2)))



def performUtest(event, offset:int = 14, tweetsdf:pd.DataFrame =  tweetsdf, negativeEventDates:dict = negativeEventDates):
    dateOffset = pd.DateOffset(days=offset)

    # Group 1 = pre
    # Group 2 = post
    group1 = tweetsdf[(tweetsdf['date'] >= (negativeEventDates[event] - dateOffset)) & (tweetsdf['date'] < negativeEventDates[event])]['score']
    group2 = tweetsdf[(tweetsdf['date'] <= (negativeEventDates[event] + dateOffset)) & (tweetsdf['date'] > negativeEventDates[event])]['score']

    # Perform Mann-Whitney U test
    statistic, p_final = ss.mannwhitneyu(group1, group2)

    return (p_final, (group1.mean(), len(group1)), (group2.mean(), len(group2)))




# 2009 September Flood

In [41]:
# Defining event and keyword
event = 'flood' 
keyword = 'flood'
dateOffset = pd.DateOffset(days=14)
keywords = ['flood']
title = 'flood'

# South East US latitudes and longitudes
location = [30.5, 35.5, -78.5, -92.5, 'South East US']

# Defining where to store plots
plotFile = '../plots/flood'
figW = 800
figH = 600

# Plotting average happiness around 28-day period for all data
plotAvgHappiness(event, keywords, title, 14).write_image(f"{plotFile}/avgHap28day.png", format="png", width=figW, height=figH)
plotAvgHappinessGeo(event, keywords, title, location, 14).write_image(f"{plotFile}/avgHap28daybyLocation.png", format="png", width=figW, height=figH)


# Graphing keyword freq 
## Whole dataset
keywordCountsDF  = pd.DataFrame({'date': tweetsdf['date'],'bool':tweetsdf['original_tweet'].str.contains('|'.join(keywords) if len(keywords)>1 else keywords[0])}).groupby(pd.Grouper(key='date', freq='D')).sum()

# Plotting keyword freq 
px.line(
    data_frame=keywordCountsDF,
    x=keywordCountsDF.index,
    y=keywordCountsDF['bool'],
    labels={'bool': 'Frequency', 'date': 'Date'}
).write_image(f"{plotFile}/freqWhole.png", format="png", width=figW, height=figH)

## Just 28 days
filteredKeywordCountDF = keywordCountsDF.loc[(keywordCountsDF.index >= (negativeEventDates[event] - dateOffset)) & (keywordCountsDF.index <= negativeEventDates[event] + dateOffset)]

# Plotting keyword freq around date
px.line(
    data_frame=filteredKeywordCountDF,
    x=filteredKeywordCountDF.index,
    y=filteredKeywordCountDF['bool'],
    labels={'bool': 'Frequency', 'date': 'Date'}
).write_image(f"{plotFile}/freq28Day.png", format="png", width=figW, height=figH)

# Performing Welsch t-Tests on all data
## 28-day
t28pValue, t28preData, t28postData = performTtest(event, 14)
t28diff = t28postData[0] - t28preData[0]

## 14-day
t14pValue, t14preData, t14postData = performTtest(event, 7)
t14diff = t14postData[0] - t14preData[0]

## 6-day
t6pValue, t6preData, t6postData = performTtest(event, 3)
t6diff = t6postData[0] - t6preData[0]

# Performing Mann-Whitney u-Tests on all data
## 28-day
u28pValue, u28preData, u28postData = performUtest(event, 14)
u28diff = u28postData[0] - u28preData[0]

## 14-day
u14pValue, u14preData, u14postData = performUtest(event, 7)
u14diff = u14postData[0] - u14preData[0]

## 6-day
u6pValue, u6preData, u6postData = performUtest(event, 3)
u6diff = u6postData[0] - u6preData[0]

# Creating table 
tableData = {
    'Time Period': ['28 days', '14 days', '6 days'], 
    'Pre-Group Avg. Happiness': [t28preData[0], t14preData[0], t6preData[0]],
    'Post-Group Avg. Happiness': [t28postData[0], t14postData[0], t6postData[0]],
    'Difference in Avg. Happiness': [t28diff, t14diff, t6diff],
    'Sample Size Pre-Group': [t28preData[1], t14preData[1], t6preData[1]],
    'Sample Size Post-Group': [t28postData[1], t14postData[1], t6postData[1]],
    'Welch\'s t-Test p-value': [t28pValue, t14pValue, t6pValue],
    'Mann-Whitney U test p-value': [u28pValue, u14pValue, u6pValue],
}

table = pd.DataFrame(tableData)
styledTable = table.style.format({'Welch\'s t-Test p-value': '{:.2e}', 'Mann-Whitney U test p-value': '{:.2e}'})
styledTable = styledTable.set_properties(**{
    'text-align': 'center',
    'font-size': '14pt',
    'font-family': 'Arial, sans-serif',
    'border-collapse': 'collapse',
    'border': '1px solid #ddd',
    'background-color': '#f7f7f7',
    'color': '#333',
    'padding': '10px',
}).set_table_styles([{    'selector': 'th',    'props': [        ('background-color', '#4CAF50'),        ('color', 'white'),        ('border-top', '1px solid #ddd'),        ('border-bottom', '1px solid #ddd'),        ('font-weight', 'bold'),        ('padding', '10px')    ]
}, {
    'selector': 'td',
    'props': [
        ('border-top', '1px solid #ddd'),
        ('border-bottom', '1px solid #ddd'),
        ('padding', '10px')
    ]
}])

html = styledTable.render()

# Save HTML as PNG image using imgkit
imgkit.from_string(html, f'{plotFile}/{event}AllData.png')

# Performing Welsch t-Tests on filtered data
filteredTweetsDF = tweetsdf[tweetsdf['original_tweet'].str.contains('|'.join(keywords) if len(keywords)>1 else keywords[0])]
## 28-day
t28pValue, t28preData, t28postData = performTtest(event, 14, tweetsdf=filteredTweetsDF)
t28diff = t28postData[0] - t28preData[0]

## 14-day
t14pValue, t14preData, t14postData = performTtest(event, 7, tweetsdf=filteredTweetsDF)
t14diff = t14postData[0] - t14preData[0]

## 6-day
t6pValue, t6preData, t6postData = performTtest(event, 3, tweetsdf=filteredTweetsDF)
t6diff = t6postData[0] - t6preData[0]

# Performing Mann-Whitney u-Tests on all data
## 28-day
u28pValue, u28preData, u28postData = performUtest(event, 14, tweetsdf=filteredTweetsDF)
u28diff = u28postData[0] - u28preData[0]

## 14-day
u14pValue, u14preData, u14postData = performUtest(event, 7, tweetsdf=filteredTweetsDF)
u14diff = u14postData[0] - u14preData[0]

## 6-day
u6pValue, u6preData, u6postData = performUtest(event, 3, tweetsdf=filteredTweetsDF)
u6diff = u6postData[0] - u6preData[0]

# Creating table 
tableData = {
    'Time Period': ['28 days', '14 days', '6 days'], 
    'Pre-Group Avg. Happiness': [t28preData[0], t14preData[0], t6preData[0]],
    'Post-Group Avg. Happiness': [t28postData[0], t14postData[0], t6postData[0]],
    'Difference in Avg. Happiness': [t28diff, t14diff, t6diff],
    'Sample Size Pre-Group': [t28preData[1], t14preData[1], t6preData[1]],
    'Sample Size Post-Group': [t28postData[1], t14postData[1], t6postData[1]],
    'Welch\'s t-Test p-value': [t28pValue, t14pValue, t6pValue],
    'Mann-Whitney U test p-value': [u28pValue, u14pValue, u6pValue],
}

table = pd.DataFrame(tableData)
styledTable = table.style.format({'Welch\'s t-Test p-value': '{:.2e}', 'Mann-Whitney U test p-value': '{:.2e}'})
styledTable = styledTable.set_properties(**{
    'text-align': 'center',
    'font-size': '14pt',
    'font-family': 'Arial, sans-serif',
    'border-collapse': 'collapse',
    'border': '1px solid #ddd',
    'background-color': '#f7f7f7',
    'color': '#333',
    'padding': '10px',
}).set_table_styles([{    'selector': 'th',    'props': [        ('background-color', '#4CAF50'),        ('color', 'white'),        ('border-top', '1px solid #ddd'),        ('border-bottom', '1px solid #ddd'),        ('font-weight', 'bold'),        ('padding', '10px')    ]
}, {
    'selector': 'td',
    'props': [
        ('border-top', '1px solid #ddd'),
        ('border-bottom', '1px solid #ddd'),
        ('padding', '10px')
    ]
}])

html = styledTable.render()
# Save HTML as PNG image using imgkit
imgkit.from_string(html, f'{plotFile}/{event}FilteredData.png')


p-value may not be accurate for N > 5000.



At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are not equal.
There is a significant difference between the groups. p = 0.4364986242278558. var = False
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are not equal.
There is a significant difference between the groups. p = 0.316726412839658. var = False
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are not equal.
There is a significant difference between the groups. p = 0.025287423648294508. var = False



this method is deprecated in favour of `Styler.to_html()`



Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are not equal.
There is a significant difference between the groups. p = 0.012726834591608895. var = False
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 0.09376361250339224. var = True
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 0.25405611145124685. var = True



this method is deprecated in favour of `Styler.to_html()`



Loading page (1/2)


True

# 2009 December Winter Storm

In [50]:

# Defining event and keyword
event = 'winter-storm' 
dateOffset = pd.DateOffset(days=14)
keywords = ['storm']
title = 'Winter Storm'

# South East US
location = [38.5, 39.5, -76.5, -77.5, 'Washington DC']


# Defining where to store plots
plotFile = '../plots/winter-storm'
figW = 800
figH = 600

# Plotting average happiness around 28-day period for all data
plotAvgHappiness(event, keywords, title, 14).write_image(f"{plotFile}/avgHap28day.png", format="png", width=figW, height=figH)
plotAvgHappinessGeo(event, keywords, title, location, 14).write_image(f"{plotFile}/avgHap28daybyLocation.png", format="png", width=figW, height=figH)

# Graphing keyword freq 
## Whole dataset
keywordCountsDF  = pd.DataFrame({'date': tweetsdf['date'],'bool':tweetsdf['original_tweet'].str.contains('|'.join(keywords) if len(keywords)>1 else keywords[0])}).groupby(pd.Grouper(key='date', freq='D')).sum()

# Plotting keyword freq 
px.line(
    data_frame=keywordCountsDF,
    x=keywordCountsDF.index,
    y=keywordCountsDF['bool'],
    labels={'bool': 'Frequency', 'date': 'Date'}
).write_image(f"{plotFile}/freqWhole.png", format="png", width=figW, height=figH)

## Just 28 days
filteredKeywordCountDF = keywordCountsDF.loc[(keywordCountsDF.index >= (negativeEventDates[event] - dateOffset)) & (keywordCountsDF.index <= negativeEventDates[event] + dateOffset)]

# Plotting keyword freq around date
px.line(
    data_frame=filteredKeywordCountDF,
    x=filteredKeywordCountDF.index,
    y=filteredKeywordCountDF['bool'],
    labels={'bool': 'Frequency', 'date': 'Date'}
).write_image(f"{plotFile}/freq28Day.png", format="png", width=figW, height=figH)

# Performing Welsch t-Tests on all data
## 28-day
t28pValue, t28preData, t28postData = performTtest(event, 14)
t28diff = t28postData[0] - t28preData[0]

## 14-day
t14pValue, t14preData, t14postData = performTtest(event, 7)
t14diff = t14postData[0] - t14preData[0]

## 6-day
t6pValue, t6preData, t6postData = performTtest(event, 3)
t6diff = t6postData[0] - t6preData[0]

# Performing Mann-Whitney u-Tests on all data
## 28-day
u28pValue, u28preData, u28postData = performUtest(event, 14)
u28diff = u28postData[0] - u28preData[0]

## 14-day
u14pValue, u14preData, u14postData = performUtest(event, 7)
u14diff = u14postData[0] - u14preData[0]

## 6-day
u6pValue, u6preData, u6postData = performUtest(event, 3)
u6diff = u6postData[0] - u6preData[0]

# Creating table 
tableData = {
    'Time Period': ['28 days', '14 days', '6 days'], 
    'Pre-Group Avg. Happiness': [t28preData[0], t14preData[0], t6preData[0]],
    'Post-Group Avg. Happiness': [t28postData[0], t14postData[0], t6postData[0]],
    'Difference in Avg. Happiness': [t28diff, t14diff, t6diff],
    'Sample Size Pre-Group': [t28preData[1], t14preData[1], t6preData[1]],
    'Sample Size Post-Group': [t28postData[1], t14postData[1], t6postData[1]],
    'Welch\'s t-Test p-value': [t28pValue, t14pValue, t6pValue],
    'Mann-Whitney U test p-value': [u28pValue, u14pValue, u6pValue],
}

table = pd.DataFrame(tableData)
styledTable = table.style.format({'Welch\'s t-Test p-value': '{:.2e}', 'Mann-Whitney U test p-value': '{:.2e}'})
styledTable = styledTable.set_properties(**{
    'text-align': 'center',
    'font-size': '14pt',
    'font-family': 'Arial, sans-serif',
    'border-collapse': 'collapse',
    'border': '1px solid #ddd',
    'background-color': '#f7f7f7',
    'color': '#333',
    'padding': '10px',
}).set_table_styles([{    'selector': 'th',    'props': [        ('background-color', '#4CAF50'),        ('color', 'white'),        ('border-top', '1px solid #ddd'),        ('border-bottom', '1px solid #ddd'),        ('font-weight', 'bold'),        ('padding', '10px')    ]
}, {
    'selector': 'td',
    'props': [
        ('border-top', '1px solid #ddd'),
        ('border-bottom', '1px solid #ddd'),
        ('padding', '10px')
    ]
}])

html = styledTable.render()

# Save HTML as PNG image using imgkit
imgkit.from_string(html, f'{plotFile}/{event}AllData.png')

# Performing Welsch t-Tests on filtered data
filteredTweetsDF = tweetsdf[tweetsdf['original_tweet'].str.contains('|'.join(keywords) if len(keywords)>1 else keywords[0])]
## 28-day
t28pValue, t28preData, t28postData = performTtest(event, 14, tweetsdf=filteredTweetsDF)
t28diff = t28postData[0] - t28preData[0]

## 14-day
t14pValue, t14preData, t14postData = performTtest(event, 7, tweetsdf=filteredTweetsDF)
t14diff = t14postData[0] - t14preData[0]

## 6-day
t6pValue, t6preData, t6postData = performTtest(event, 3, tweetsdf=filteredTweetsDF)
t6diff = t6postData[0] - t6preData[0]

# Performing Mann-Whitney u-Tests on all data
## 28-day
u28pValue, u28preData, u28postData = performUtest(event, 14, tweetsdf=filteredTweetsDF)
u28diff = u28postData[0] - u28preData[0]

## 14-day
u14pValue, u14preData, u14postData = performUtest(event, 7, tweetsdf=filteredTweetsDF)
u14diff = u14postData[0] - u14preData[0]

## 6-day
u6pValue, u6preData, u6postData = performUtest(event, 3, tweetsdf=filteredTweetsDF)
u6diff = u6postData[0] - u6preData[0]

# Creating table 
tableData = {
    'Time Period': ['28 days', '14 days', '6 days'], 
    'Pre-Group Avg. Happiness': [t28preData[0], t14preData[0], t6preData[0]],
    'Post-Group Avg. Happiness': [t28postData[0], t14postData[0], t6postData[0]],
    'Difference in Avg. Happiness': [t28diff, t14diff, t6diff],
    'Sample Size Pre-Group': [t28preData[1], t14preData[1], t6preData[1]],
    'Sample Size Post-Group': [t28postData[1], t14postData[1], t6postData[1]],
    'Welch\'s t-Test p-value': [t28pValue, t14pValue, t6pValue],
    'Mann-Whitney U test p-value': [u28pValue, u14pValue, u6pValue],
}

table = pd.DataFrame(tableData)
styledTable = table.style.format({'Welch\'s t-Test p-value': '{:.2e}', 'Mann-Whitney U test p-value': '{:.2e}'})
styledTable = styledTable.set_properties(**{
    'text-align': 'center',
    'font-size': '14pt',
    'font-family': 'Arial, sans-serif',
    'border-collapse': 'collapse',
    'border': '1px solid #ddd',
    'background-color': '#f7f7f7',
    'color': '#333',
    'padding': '10px',
}).set_table_styles([{    'selector': 'th',    'props': [        ('background-color', '#4CAF50'),        ('color', 'white'),        ('border-top', '1px solid #ddd'),        ('border-bottom', '1px solid #ddd'),        ('font-weight', 'bold'),        ('padding', '10px')    ]
}, {
    'selector': 'td',
    'props': [
        ('border-top', '1px solid #ddd'),
        ('border-bottom', '1px solid #ddd'),
        ('padding', '10px')
    ]
}])

html = styledTable.render()
# Save HTML as PNG image using imgkit
imgkit.from_string(html, f'{plotFile}/{event}FilteredData.png')


p-value may not be accurate for N > 5000.



At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 9.00070923871504e-74. var = True
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are not equal.
There is a significant difference between the groups. p = 8.353961399234323e-06. var = False
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 0.054883412532270484. var = True



this method is deprecated in favour of `Styler.to_html()`



Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 0.08410572620200885. var = True
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 0.35519818838663775. var = True
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 0.13053873004570188. var = True



this method is deprecated in favour of `Styler.to_html()`



Loading page (1/2)


True

In [43]:

# Defining event and keyword
event = 'appomattrox' 
dateOffset = pd.DateOffset(days=14)
keywords = ['shooting', 'gun']
title = 'Appomattrox Mass Shooting'

# Virginia
location = [25, 47, -72, -78, 'Virginia']


# Defining where to store plots
plotFile = '../plots/appomattrox'
figW = 800
figH = 600

# Plotting average happiness around 28-day period for all data
plotAvgHappiness(event, keywords, title, 14).write_image(f"{plotFile}/avgHap28day.png", format="png", width=figW, height=figH)
plotAvgHappinessGeo(event, keywords, title, location, 14).write_image(f"{plotFile}/avgHap28daybyLocation.png", format="png", width=figW, height=figH)

# Graphing keyword freq 
## Whole dataset
keywordCountsDF  = pd.DataFrame({'date': tweetsdf['date'],'bool':tweetsdf['original_tweet'].str.contains('|'.join(keywords) if len(keywords)>1 else keywords[0])}).groupby(pd.Grouper(key='date', freq='D')).sum()

# Plotting keyword freq 
px.line(
    data_frame=keywordCountsDF,
    x=keywordCountsDF.index,
    y=keywordCountsDF['bool'],
    labels={'bool': 'Frequency', 'date': 'Date'}
).write_image(f"{plotFile}/freqWhole.png", format="png", width=figW, height=figH)

## Just 28 days
filteredKeywordCountDF = keywordCountsDF.loc[(keywordCountsDF.index >= (negativeEventDates[event] - dateOffset)) & (keywordCountsDF.index <= negativeEventDates[event] + dateOffset)]

# Plotting keyword freq around date
px.line(
    data_frame=filteredKeywordCountDF,
    x=filteredKeywordCountDF.index,
    y=filteredKeywordCountDF['bool'],
    labels={'bool': 'Frequency', 'date': 'Date'}
).write_image(f"{plotFile}/freq28Day.png", format="png", width=figW, height=figH)

# Performing Welsch t-Tests on all data
## 28-day
t28pValue, t28preData, t28postData = performTtest(event, 14)
t28diff = t28postData[0] - t28preData[0]

## 14-day
t14pValue, t14preData, t14postData = performTtest(event, 7)
t14diff = t14postData[0] - t14preData[0]

## 6-day
t6pValue, t6preData, t6postData = performTtest(event, 3)
t6diff = t6postData[0] - t6preData[0]

# Performing Mann-Whitney u-Tests on all data
## 28-day
u28pValue, u28preData, u28postData = performUtest(event, 14)
u28diff = u28postData[0] - u28preData[0]

## 14-day
u14pValue, u14preData, u14postData = performUtest(event, 7)
u14diff = u14postData[0] - u14preData[0]

## 6-day
u6pValue, u6preData, u6postData = performUtest(event, 3)
u6diff = u6postData[0] - u6preData[0]

# Creating table 
tableData = {
    'Time Period': ['28 days', '14 days', '6 days'], 
    'Pre-Group Avg. Happiness': [t28preData[0], t14preData[0], t6preData[0]],
    'Post-Group Avg. Happiness': [t28postData[0], t14postData[0], t6postData[0]],
    'Difference in Avg. Happiness': [t28diff, t14diff, t6diff],
    'Sample Size Pre-Group': [t28preData[1], t14preData[1], t6preData[1]],
    'Sample Size Post-Group': [t28postData[1], t14postData[1], t6postData[1]],
    'Welch\'s t-Test p-value': [t28pValue, t14pValue, t6pValue],
    'Mann-Whitney U test p-value': [u28pValue, u14pValue, u6pValue],
}

table = pd.DataFrame(tableData)
styledTable = table.style.format({'Welch\'s t-Test p-value': '{:.2e}', 'Mann-Whitney U test p-value': '{:.2e}'})
styledTable = styledTable.set_properties(**{
    'text-align': 'center',
    'font-size': '14pt',
    'font-family': 'Arial, sans-serif',
    'border-collapse': 'collapse',
    'border': '1px solid #ddd',
    'background-color': '#f7f7f7',
    'color': '#333',
    'padding': '10px',
}).set_table_styles([{    'selector': 'th',    'props': [        ('background-color', '#4CAF50'),        ('color', 'white'),        ('border-top', '1px solid #ddd'),        ('border-bottom', '1px solid #ddd'),        ('font-weight', 'bold'),        ('padding', '10px')    ]
}, {
    'selector': 'td',
    'props': [
        ('border-top', '1px solid #ddd'),
        ('border-bottom', '1px solid #ddd'),
        ('padding', '10px')
    ]
}])

html = styledTable.render()

# Save HTML as PNG image using imgkit
imgkit.from_string(html, f'{plotFile}/{event}AllData.png')

# Performing Welsch t-Tests on filtered data
filteredTweetsDF = tweetsdf[tweetsdf['original_tweet'].str.contains('|'.join(keywords) if len(keywords)>1 else keywords[0])]
## 28-day
t28pValue, t28preData, t28postData = performTtest(event, 14, tweetsdf=filteredTweetsDF)
t28diff = t28postData[0] - t28preData[0]

## 14-day
t14pValue, t14preData, t14postData = performTtest(event, 7, tweetsdf=filteredTweetsDF)
t14diff = t14postData[0] - t14preData[0]

## 6-day
t6pValue, t6preData, t6postData = performTtest(event, 3, tweetsdf=filteredTweetsDF)
t6diff = t6postData[0] - t6preData[0]

# Performing Mann-Whitney u-Tests on all data
## 28-day
u28pValue, u28preData, u28postData = performUtest(event, 14, tweetsdf=filteredTweetsDF)
u28diff = u28postData[0] - u28preData[0]

## 14-day
u14pValue, u14preData, u14postData = performUtest(event, 7, tweetsdf=filteredTweetsDF)
u14diff = u14postData[0] - u14preData[0]

## 6-day
u6pValue, u6preData, u6postData = performUtest(event, 3, tweetsdf=filteredTweetsDF)
u6diff = u6postData[0] - u6preData[0]

# Creating table 
tableData = {
    'Time Period': ['28 days', '14 days', '6 days'], 
    'Pre-Group Avg. Happiness': [t28preData[0], t14preData[0], t6preData[0]],
    'Post-Group Avg. Happiness': [t28postData[0], t14postData[0], t6postData[0]],
    'Difference in Avg. Happiness': [t28diff, t14diff, t6diff],
    'Sample Size Pre-Group': [t28preData[1], t14preData[1], t6preData[1]],
    'Sample Size Post-Group': [t28postData[1], t14postData[1], t6postData[1]],
    'Welch\'s t-Test p-value': [t28pValue, t14pValue, t6pValue],
    'Mann-Whitney U test p-value': [u28pValue, u14pValue, u6pValue],
}

table = pd.DataFrame(tableData)
styledTable = table.style.format({'Welch\'s t-Test p-value': '{:.2e}', 'Mann-Whitney U test p-value': '{:.2e}'})
styledTable = styledTable.set_properties(**{
    'text-align': 'center',
    'font-size': '14pt',
    'font-family': 'Arial, sans-serif',
    'border-collapse': 'collapse',
    'border': '1px solid #ddd',
    'background-color': '#f7f7f7',
    'color': '#333',
    'padding': '10px',
}).set_table_styles([{    'selector': 'th',    'props': [        ('background-color', '#4CAF50'),        ('color', 'white'),        ('border-top', '1px solid #ddd'),        ('border-bottom', '1px solid #ddd'),        ('font-weight', 'bold'),        ('padding', '10px')    ]
}, {
    'selector': 'td',
    'props': [
        ('border-top', '1px solid #ddd'),
        ('border-bottom', '1px solid #ddd'),
        ('padding', '10px')
    ]
}])

html = styledTable.render()
# Save HTML as PNG image using imgkit
imgkit.from_string(html, f'{plotFile}/{event}FilteredData.png')


p-value may not be accurate for N > 5000.



At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are not equal.
There is a significant difference between the groups. p = 0.006667688598761199. var = False
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are not equal.
There is a significant difference between the groups. p = 0.4995133422811191. var = False
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 0.2234390365702976. var = True



this method is deprecated in favour of `Styler.to_html()`



Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 0.516312186682454. var = True
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 0.630730738064629. var = True
At least one sample is not normally distributed. However, sample size is large enough to ignore.
Variances are equal.
There is no significant difference between the groups. p = 0.7939521545801682. var = True



this method is deprecated in favour of `Styler.to_html()`



Loading page (1/2)


True

In [44]:

# Defining event and keyword
event = 'fort-hood' 
dateOffset = pd.DateOffset(days=14)
keywords = ['shooting', 'gun']
title = 'Fort Hood Mass Shooting'

# Texas
locations = [ 25, 36, -94, -106. 'Texas']

# Defining where to store plots
plotFile = '../plots/fort-hood'
figW = 800
figH = 600

# Plotting average happiness around 28-day period for all data
plotAvgHappiness(event, keywords, title, 14).write_image(f"{plotFile}/avgHap28day.png", format="png", width=figW, height=figH)
plotAvgHappinessGeo(event, keywords, title, location, 14).write_image(f"{plotFile}/avgHap28daybyLocation.png", format="png", width=figW, height=figH)

# Graphing keyword freq 
## Whole dataset
keywordCountsDF  = pd.DataFrame({'date': tweetsdf['date'],'bool':tweetsdf['original_tweet'].str.contains('|'.join(keywords) if len(keywords)>1 else keywords[0])}).groupby(pd.Grouper(key='date', freq='D')).sum()

# Plotting keyword freq 
px.line(
    data_frame=keywordCountsDF,
    x=keywordCountsDF.index,
    y=keywordCountsDF['bool'],
    labels={'bool': 'Frequency', 'date': 'Date'}
).write_image(f"{plotFile}/freqWhole.png", format="png", width=figW, height=figH)

## Just 28 days
filteredKeywordCountDF = keywordCountsDF.loc[(keywordCountsDF.index >= (negativeEventDates[event] - dateOffset)) & (keywordCountsDF.index <= negativeEventDates[event] + dateOffset)]

# Plotting keyword freq around date
px.line(
    data_frame=filteredKeywordCountDF,
    x=filteredKeywordCountDF.index,
    y=filteredKeywordCountDF['bool'],
    labels={'bool': 'Frequency', 'date': 'Date'}
).write_image(f"{plotFile}/freq28Day.png", format="png", width=figW, height=figH)

# Performing Welsch t-Tests on all data
## 28-day
t28pValue, t28preData, t28postData = performTtest(event, 14)
t28diff = t28postData[0] - t28preData[0]

## 14-day
t14pValue, t14preData, t14postData = performTtest(event, 7)
t14diff = t14postData[0] - t14preData[0]

## 6-day
t6pValue, t6preData, t6postData = performTtest(event, 3)
t6diff = t6postData[0] - t6preData[0]

# Performing Mann-Whitney u-Tests on all data
## 28-day
u28pValue, u28preData, u28postData = performUtest(event, 14)
u28diff = u28postData[0] - u28preData[0]

## 14-day
u14pValue, u14preData, u14postData = performUtest(event, 7)
u14diff = u14postData[0] - u14preData[0]

## 6-day
u6pValue, u6preData, u6postData = performUtest(event, 3)
u6diff = u6postData[0] - u6preData[0]

# Creating table 
tableData = {
    'Time Period': ['28 days', '14 days', '6 days'], 
    'Pre-Group Avg. Happiness': [t28preData[0], t14preData[0], t6preData[0]],
    'Post-Group Avg. Happiness': [t28postData[0], t14postData[0], t6postData[0]],
    'Difference in Avg. Happiness': [t28diff, t14diff, t6diff],
    'Sample Size Pre-Group': [t28preData[1], t14preData[1], t6preData[1]],
    'Sample Size Post-Group': [t28postData[1], t14postData[1], t6postData[1]],
    'Welch\'s t-Test p-value': [t28pValue, t14pValue, t6pValue],
    'Mann-Whitney U test p-value': [u28pValue, u14pValue, u6pValue],
}

table = pd.DataFrame(tableData)
styledTable = table.style.format({'Welch\'s t-Test p-value': '{:.2e}', 'Mann-Whitney U test p-value': '{:.2e}'})
styledTable = styledTable.set_properties(**{
    'text-align': 'center',
    'font-size': '14pt',
    'font-family': 'Arial, sans-serif',
    'border-collapse': 'collapse',
    'border': '1px solid #ddd',
    'background-color': '#f7f7f7',
    'color': '#333',
    'padding': '10px',
}).set_table_styles([{    'selector': 'th',    'props': [        ('background-color', '#4CAF50'),        ('color', 'white'),        ('border-top', '1px solid #ddd'),        ('border-bottom', '1px solid #ddd'),        ('font-weight', 'bold'),        ('padding', '10px')    ]
}, {
    'selector': 'td',
    'props': [
        ('border-top', '1px solid #ddd'),
        ('border-bottom', '1px solid #ddd'),
        ('padding', '10px')
    ]
}])

html = styledTable.render()

# Save HTML as PNG image using imgkit
imgkit.from_string(html, f'{plotFile}/{event}AllData.png')

# Performing Welsch t-Tests on filtered data
filteredTweetsDF = tweetsdf[tweetsdf['original_tweet'].str.contains('|'.join(keywords) if len(keywords)>1 else keywords[0])]
## 28-day
t28pValue, t28preData, t28postData = performTtest(event, 14, tweetsdf=filteredTweetsDF)
t28diff = t28postData[0] - t28preData[0]

## 14-day
t14pValue, t14preData, t14postData = performTtest(event, 7, tweetsdf=filteredTweetsDF)
t14diff = t14postData[0] - t14preData[0]

## 6-day
t6pValue, t6preData, t6postData = performTtest(event, 3, tweetsdf=filteredTweetsDF)
t6diff = t6postData[0] - t6preData[0]

# Performing Mann-Whitney u-Tests on all data
## 28-day
u28pValue, u28preData, u28postData = performUtest(event, 14, tweetsdf=filteredTweetsDF)
u28diff = u28postData[0] - u28preData[0]

## 14-day
u14pValue, u14preData, u14postData = performUtest(event, 7, tweetsdf=filteredTweetsDF)
u14diff = u14postData[0] - u14preData[0]

## 6-day
u6pValue, u6preData, u6postData = performUtest(event, 3, tweetsdf=filteredTweetsDF)
u6diff = u6postData[0] - u6preData[0]

# Creating table 
tableData = {
    'Time Period': ['28 days', '14 days', '6 days'], 
    'Pre-Group Avg. Happiness': [t28preData[0], t14preData[0], t6preData[0]],
    'Post-Group Avg. Happiness': [t28postData[0], t14postData[0], t6postData[0]],
    'Difference in Avg. Happiness': [t28diff, t14diff, t6diff],
    'Sample Size Pre-Group': [t28preData[1], t14preData[1], t6preData[1]],
    'Sample Size Post-Group': [t28postData[1], t14postData[1], t6postData[1]],
    'Welch\'s t-Test p-value': [t28pValue, t14pValue, t6pValue],
    'Mann-Whitney U test p-value': [u28pValue, u14pValue, u6pValue],
}

table = pd.DataFrame(tableData)
styledTable = table.style.format({'Welch\'s t-Test p-value': '{:.2e}', 'Mann-Whitney U test p-value': '{:.2e}'})
styledTable = styledTable.set_properties(**{
    'text-align': 'center',
    'font-size': '14pt',
    'font-family': 'Arial, sans-serif',
    'border-collapse': 'collapse',
    'border': '1px solid #ddd',
    'background-color': '#f7f7f7',
    'color': '#333',
    'padding': '10px',
}).set_table_styles([{    'selector': 'th',    'props': [        ('background-color', '#4CAF50'),        ('color', 'white'),        ('border-top', '1px solid #ddd'),        ('border-bottom', '1px solid #ddd'),        ('font-weight', 'bold'),        ('padding', '10px')    ]
}, {
    'selector': 'td',
    'props': [
        ('border-top', '1px solid #ddd'),
        ('border-bottom', '1px solid #ddd'),
        ('padding', '10px')
    ]
}])

html = styledTable.render()
# Save HTML as PNG image using imgkit
imgkit.from_string(html, f'{plotFile}/{event}FilteredData.png')

SyntaxError: invalid syntax (4261863952.py, line 8)