In [None]:
import pandas as pd
from database.connect import getConnection
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import seaborn as sns
sns.set()
from pathlib import Path
import csv

In [None]:
# Open the database connection shared by every query in this notebook.
try:
    connection = getConnection()
except Exception:
    # Print a hint about the most likely cause, then re-raise so execution
    # stops here instead of failing later on an undefined `connection`.
    print("✖️ Error while connecting to MySQL engine database.")
    # The trailing space after "at" matters: the two literals are concatenated.
    print(
        "ℹ️ Please make sure the environment file `.env` is located at "
        "the project root directory and contains proper configuration."
    )
    raise

# Single cursor reused by the query cells below.
cursor = connection.cursor()

# Maps each category name to the list of keywords that belong to it.
# The keys double as column names further down: the non-excluded ones are
# interpolated into the SELECT list of the category-count queries.
categories = {
    'booking': ['booking', 'booked', 'book', 'ticket', 'tickets'],
    'canceling': ['canceled', 'cancellations'],
    'money': ['refund', 'compensation', 'claim', 'money', 'pay', 'paid'],
    'baggage': ['bag', 'baggage', 'luggage', 'bags'],
    'staff': ['staff', 'crew'],
    'waiting': ['waiting', 'delay', 'wait'],
    'boarding': ['boarding'],
    'stuck': ['stuck'],
    # The three categories below are the ones left out of the CSV export.
    'information': ['info', 'information'],
    'customers': ['customer', 'customers', 'passenger', 'passengers'],
    # NOTE(review): the leading space in ' dm' looks deliberate (presumably to
    # avoid matching "dm" inside longer words) — confirm before changing it.
    'dm': [' dm'],
}

# Maps an airline label to the lowercase spellings it may appear under.
airlines_dict = {
    'KLM': ['klm'],
    'AirFrance': ['airfrance', 'air france'],
    'British_Airways': ['british_airways', 'british airways'],
    'AmericanAir': ['americanair', 'american airlines'],
    'Lufthansa': ['lufthansa'],
    'AirBerlin': ['airberlin', 'air berlin'],
    'AirBerlin assist': [
        'airberlin assist',
        'air berlin assist',
        'airberlinassist',
    ],
    'easyJet': ['easyjet'],
    'RyanAir': ['ryanair'],
    'SingaporeAir': ['singaporeair', 'singapore airlines'],
    'Qantas': ['qantas'],
    'EtihadAirways': ['etihad airways', 'etihadairways', 'etihad'],
    'VirginAtlantic': ['virgin atlantic', 'virginatlantic'],
}

# Airlines that get their own export. 'Other' is not a key of `airlines_dict`;
# the export loop special-cases it with its own query.
airlines_of_interest = ['AmericanAir', 'Other', 'British_Airways']

# Preprocessing for categorisation

In [None]:
# Export one CSV per airline of interest, holding `timestamp_ms` plus every
# relevant category column of the matching tweets. An airline whose CSV
# already exists is skipped, so the queries only run once.

# Categories that are left out of the export.
excluided = ['dm', 'customers', 'information']
relevant_categories = [key for key in categories if key not in excluided]
columns = ['timestamp_ms'] + relevant_categories
df_category_counts = pd.DataFrame(columns=columns)

# Legacy form: every category name is followed by ", " (trailing separator
# included). Kept unchanged in case other cells still slice it with [:-2].
sum_categories = "".join(f"{category}, " for category in relevant_categories)

# Column list for the SELECT clause, in the same order as `columns`.
select_list = ", ".join(columns)

# Part of the query shared by every airline: joins tweets to their
# conversation and keeps tweets whose language is 'en' and whose text does
# not start with 'RT'.
base_query = f"""SELECT {select_list}
                    FROM tweets, part_of, conversations
                    WHERE tweets.id = part_of.tID
                    AND part_of.cID = conversations.id
                    AND `text` NOT LIKE 'RT%'
                    AND language = 'en'
                    """

# `DataFrame.to_csv` does not create missing directories, so make sure the
# target directory exists before any export is attempted.
output_dir = Path('./pre-processed')
output_dir.mkdir(parents=True, exist_ok=True)

for airline in airlines_of_interest:
    file_path = output_dir / f'{airline}_category_counts.csv'
    if file_path.exists():
        print(f"file {airline}_category_counts.csv already exists")
        continue

    if airline == 'Other':
        # NOTE(review): these exclusions are hard-coded and must be kept in
        # sync with `airlines_of_interest`. The '[]' value presumably marks a
        # conversation with no airline — confirm against the schema.
        airline_filter = """AND conversations.airline NOT LIKE '[]'
                    AND conversations.airline NOT LIKE '%British_Airways%'
                    AND conversations.airline NOT LIKE '%AmericanAir%'
                    """
    else:
        # `airline` comes from the hard-coded list above, not from user input.
        airline_filter = f"""AND conversations.airline LIKE '%{airline}%'
                    """
    query = base_query + airline_filter

    cursor.execute(query)
    results = cursor.fetchall()

    print(f'finished query for {airline}')

    # Save the results to the CSV checked at the top of the loop.
    df = pd.DataFrame(results, columns=columns)
    print(df.head())
    df.to_csv(file_path, index=False)

# The preprocessing for the sentiment per bin flailing line

In [None]:
# Cache the per-bin average sentiment in a CSV file. The database is only
# queried when that file does not exist yet.
min_tweets = 5  # not used in this cell — presumably read by later cells; TODO confirm
file_path = Path("./output/sentiment_per_bin.csv")

# Alias of `file_path`, kept in case other cells refer to it.
my_file = file_path
if not my_file.is_file():

    # One row per bin: its average sentiment (sum divided by tweet count)
    # together with Tstart, Tend and the bin position.
    query = """
    SELECT bin_id, (sentiment_sum / tweet_count) as sentiment, Tstart, Tend, bin_position
    FROM binned_sentiment, conversations as c
    WHERE c.id = binned_sentiment.cID;
    """
    cursor.execute(query)
    result = cursor.fetchall()

    # `open` does not create missing directories.
    file_path.parent.mkdir(parents=True, exist_ok=True)

    # Write header and rows through a single handle, and only after the query
    # succeeded, so a failed query never leaves a header-only file behind
    # that later runs would mistake for a finished export.
    with open(file_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["bin_id", "sentiment", "Tstart", "Tend", "bin_position"])
        writer.writerows(result)
