In [75]:
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import pandas as pd

from database.connect import getConnection



In [76]:
# Analysis window, written as DD-MM-YYYY. Both bounds are inclusive (see the
# >= / <= comparisons in `date_restriction` below).
start_date = '01-01-2000'
end_date = '01-01-2023'

# Convert start and end date to Unix timestamps in milliseconds.
# Unix time counts from the UTC epoch, so midnight is pinned to UTC explicitly:
# calling .timestamp() on a naive datetime interprets it in the machine's local
# timezone, which would shift the window by the local UTC offset and make the
# results depend on where the notebook is run.
start_date_unix = int(
    datetime.strptime(start_date, '%d-%m-%Y').replace(tzinfo=timezone.utc).timestamp() * 1000
)
end_date_unix = int(
    datetime.strptime(end_date, '%d-%m-%Y').replace(tzinfo=timezone.utc).timestamp() * 1000
)

# SQL fragment meant to be appended to an existing WHERE clause (it starts with " AND").
date_restriction = f" AND tweets.timestamp_ms >= {start_date_unix} AND tweets.timestamp_ms <= {end_date_unix}"

In [77]:
# Open the database connection and create the cursor used by the query cells.
try:
    connection = getConnection()
except Exception:
    # Print a hint about the most likely cause, then re-raise so the original
    # traceback is still shown and the notebook stops here.
    print("✖️ Error while connecting to MySQL engine database.")
    # The first literal ends with a space so the two halves do not run
    # together ("located atthe project root") when concatenated.
    print("ℹ️ Please make sure the environment file `.env` is located at "
          "the project root directory and contains proper configuration.")
    raise

cursor = connection.cursor()

# Topic categories, each mapped to a list of lowercase keywords.
# The keys double as column names in the SQL aggregation (SUM(<key>)).
# NOTE(review): presumably the keyword lists are what flags a tweet for a
# category; the matching code is not part of this cell - confirm.
categories = {
    "booking": ["booking", "booked", "book", "ticket", "tickets"],
    "canceling": ["canceled", "cancellations"],
    "money": ["refund", "compensation", "claim", "money", "pay", "paid"],
    "baggage": ["bag", "baggage", "luggage", "bags"],
    "staff": ["staff", "crew"],
    "waiting": ["waiting", "delay", "wait"],
    "boarding": ["boarding"],
    "stuck": ["stuck"],
    "information": ["info", "information"],
    "customers": ["customer", "customers", "passenger", "passengers"],
    "dm": [" dm"],  # the leading space is part of the keyword
}

# Airline label mapped to the lowercase spellings listed for it.
# NOTE(review): not used in the cells visible here; presumably consumed
# elsewhere to recognise airline mentions - confirm.
airlines_dict = {
    "KLM": ["klm"],
    "AirFrance": ["airfrance", "air france"],
    "British_Airways": ["british_airways", "british airways"],
    "AmericanAir": ["americanair", "american airlines"],
    "Lufthansa": ["lufthansa"],
    "AirBerlin": ["airberlin", "air berlin"],
    "AirBerlin assist": ["airberlin assist", "air berlin assist", "airberlinassist"],
    "easyJet": ["easyjet"],
    "RyanAir": ["ryanair"],
    "SingaporeAir": ["singaporeair", "singapore airlines"],
    "Qantas": ["qantas"],
    "EtihadAirways": ["etihad airways", "etihadairways", "etihad"],
    "VirginAtlantic": ["virgin atlantic", "virginatlantic"],
}

# Groups compared in the analysis. 'Other' is not an airline label: the query
# cell treats it as a special bucket with its own WHERE conditions.
airlines_of_interest = ["AmericanAir", "Other", "British_Airways"]

In [78]:
# Create a dataframe with the summed count of each category: one row per entry
# in `airlines_of_interest`, one column per relevant category.
excluded = ['dm', 'customers', 'information']
excluided = excluded  # misspelled original name, kept in case other cells still reference it
relevant_categories = [key for key in categories if key not in excluded]

# Aggregate list for the SELECT clause, e.g. "SUM(booking), SUM(canceling)".
select_list = ", ".join(f"SUM({category})" for category in relevant_categories)
# Original trailing-", " form, kept in case other cells still slice it with [:-2].
sum_categories = select_list + ", "

# 'Other' is a bucket rather than an airline label: it keeps conversations whose
# airline field is not '[]' and that match none of the named airlines of
# interest. The exclusions are derived from `airlines_of_interest` so that the
# bucket stays correct when that list changes.
named_airlines = [name for name in airlines_of_interest if name != 'Other']
other_filter = "\n                ".join(
    ["AND conversations.airline NOT LIKE '[]'"]
    + [f"AND conversations.airline NOT LIKE '%{name}%'" for name in named_airlines]
)

rows = []
for airline in airlines_of_interest:

    # Only the airline condition differs between the groups; the rest of the
    # query is shared.
    if airline == 'Other':
        airline_filter = other_filter
    else:
        airline_filter = f"AND conversations.airline LIKE '%{airline}%'"

    # Retweets (text starting with 'RT') and non-English tweets are left out.
    query = f"""SELECT {select_list}
                FROM tweets, part_of, conversations
                WHERE tweets.id = part_of.tID
                AND part_of.cID = conversations.id
                AND `text` NOT LIKE 'RT%'
                AND language = 'en'
                {airline_filter}
                {date_restriction}
                """

    cursor.execute(query)
    results = cursor.fetchall()

    # An aggregate query without GROUP BY returns exactly one row.
    rows.append(results[0])

    print(f"finished {airline}")

# TODO: add the total number of tweets per airline to the dataframe.

# Build the frame once from the collected rows instead of growing it row by row.
df = pd.DataFrame(rows, index=airlines_of_interest, columns=relevant_categories)

df

finished AmericanAir
finished Other
finished British_Airways


Unnamed: 0,booking,canceling,money,baggage,staff,waiting,boarding,stuck
AmericanAir,5152,908,4487,5002,2679,9389,1105,573
Other,25157,1332,28078,22599,12290,25980,4085,1903
British_Airways,11080,424,9037,4148,3205,9191,633,376


In [79]:
# Look up the summed 'canceling' count for American Airlines with a single
# row/column .loc lookup, and show the Python type of the stored value.
american_cancelations = df.loc['AmericanAir', 'canceling']
print(type(american_cancelations))

american_cancelations

<class 'decimal.Decimal'>


Decimal('908')