In [None]:
from datetime import datetime, timezone
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import ttest_ind

from database.connect import getConnection
# This notebook tests whether the mean sentiment change is significantly different from 0.

In [None]:
# Analysis window bounds, written as day-month-year strings.
start_date = '01-01-2000'
end_date = '01-01-2023'


def _date_to_unix_ms(date_string):
    """Return the Unix timestamp in milliseconds of midnight UTC on a '%d-%m-%Y' date.

    The parsed date is pinned to UTC before conversion. Calling .timestamp()
    on a naive datetime interprets it in the machine's local timezone, which
    would shift the window by the local UTC offset and make query results
    depend on where the notebook is run.
    """
    midnight_utc = datetime.strptime(date_string, '%d-%m-%Y').replace(tzinfo=timezone.utc)
    return int(midnight_utc.timestamp() * 1000)


# Window bounds as Unix timestamps in milliseconds (interpolated into the SQL below).
start_date_unix = _date_to_unix_ms(start_date)
end_date_unix = _date_to_unix_ms(end_date)

# SQL fragment to append to an existing WHERE clause; restricts
# tweets.timestamp_ms to the window, with both bounds inclusive.
date_restriction = f" AND tweets.timestamp_ms >= {start_date_unix} AND tweets.timestamp_ms <= {end_date_unix}"
# Open a database connection through the project's helper and create the
# cursor that the query cells further down execute their SQL on.
# NOTE(review): neither the cursor nor the connection is closed in the visible
# part of this notebook — confirm cleanup happens elsewhere.
connection = getConnection()
cursor = connection.cursor()

In [None]:
# Load the sentiment-per-bin data from the CSV export.
file_path = Path("./output/sentiment_per_bin.csv")
df = pd.read_csv(file_path)

# Two-sided one-sample t-test of the mean of the 'sentiment' column against 0,
# followed by the 95% confidence interval of that mean.
d_newcomb = sm.stats.DescrStatsW(df['sentiment'])
ttest_result = d_newcomb.ttest_mean(0, alternative='two-sided')
print(f"{ttest_result}, t-stat, p-value, df")
conf_interval = d_newcomb.tconfint_mean(alpha=0.05, alternative='two-sided')
print(f"{conf_interval}, CI")

In [None]:
# Fetch every binned_sentiment row joined to its conversation (bs.cID = c.id),
# skipping rows whose break_airline starts with 'prev='.
# NOTE(review): the original note described this as "all bins from conversations
# with more than 1 bin" — confirm that the 'prev=' filter really guarantees that.
cursor.execute(f""" SELECT bs.bin_id, bs.cID, bs.bin_position, bs.break_airline, bs.sentiment_sum, bs.tweet_count, c.Tstart, c.Tend
                    FROM binned_sentiment bs, conversations c
                    WHERE bs.cID = c.id AND bs.break_airline NOT LIKE 'prev=%'""")
binned_sentiment = cursor.fetchall()

# Pair each bin (bs1) with the next bin of the same conversation (bs2, at
# bin_position + 1) and return two columns per pair:
#   Airline           - display label derived from bs1.break_airline
#                       ("American Air", "British Airways" or "Other Airlines")
#   Average_sent_diff - difference between the bins' mean sentiment
#                       (sentiment_sum / tweet_count); the CASE makes it negative
#                       when the earlier bin's mean is higher and non-negative
#                       otherwise, i.e. later-bin mean minus earlier-bin mean.
# Only conversations with Tstart > start_date_unix and Tend < end_date_unix are kept.
query = f""" SELECT CASE WHEN bs1.break_airline='AmericanAir' THEN "American Air" 
                        WHEN bs1.break_airline = 'British_Airways' THEN "British Airways" 
                        ELSE "Other Airlines" END AS Airline, 
            CASE WHEN (bs1.sentiment_sum / bs1.tweet_count) > (bs2.sentiment_sum / bs2.tweet_count) THEN -1*ABS((bs1.sentiment_sum / bs1.tweet_count) - (bs2.sentiment_sum / bs2.tweet_count))
            ELSE ABS((bs1.sentiment_sum / bs1.tweet_count) - (bs2.sentiment_sum / bs2.tweet_count))
            END AS Average_sent_diff
            FROM `binned_sentiment` bs1, `binned_sentiment` bs2
            WHERE bs1.cID = bs2.cID AND bs2.bin_position = bs1.bin_position + 1 
            AND bs1.cID IN (SELECT id
                            FROM conversations
                            WHERE Tstart > {start_date_unix} AND Tend < {end_date_unix})"""
cursor.execute(query)  # airline label and mean-sentiment change for each pair of consecutive bins
sentiment_change = cursor.fetchall()

In [None]:
# Build the working frame from the fetched rows: positional column 0 becomes
# "airline", positional column 1 becomes "sentiment" (cast to float).
df_sentiment_change = (
    pd.DataFrame(sentiment_change)
    .rename(columns={0: "airline", 1: "sentiment"})
    .astype({"sentiment": float})
)

In [None]:
# Significance of the sentiment change pooled over all airline labels:
# two-sided one-sample t-test of the mean against 0, then its 95% confidence interval.
d_newcomb = sm.stats.DescrStatsW(df_sentiment_change['sentiment'])
ttest_result = d_newcomb.ttest_mean(0, alternative='two-sided')
print(f"{ttest_result}, t-stat, p-value, df")
conf_interval = d_newcomb.tconfint_mean(alpha=0.05, alternative='two-sided')
print(f"{conf_interval}, CI")

In [None]:
# Split the sentiment changes into one frame per airline label.
airline_labels = df_sentiment_change['airline']
df_other = df_sentiment_change.loc[airline_labels == "Other Airlines"]
df_aa = df_sentiment_change.loc[airline_labels == "American Air"]
df_ba = df_sentiment_change.loc[airline_labels == "British Airways"]

In [None]:
# Run the same analysis on each airline subset in turn: a two-sided one-sample
# t-test of the mean sentiment change against 0, then its 95% confidence
# interval, each preceded by a heading naming the subset.
airline_subsets = (
    ('American Airlines', df_aa),
    ('Other Airlines', df_other),
    ('British Airways', df_ba),
)
for heading, subset in airline_subsets:
    d_newcomb = sm.stats.DescrStatsW(subset['sentiment'])
    print(heading)
    print(f"{d_newcomb.ttest_mean(0, alternative='two-sided')}, t-stat, p-value, df")
    print(f"{d_newcomb.tconfint_mean(alpha=0.05, alternative='two-sided')}, CI")