In [1]:
import os
import sqlalchemy as sa
import pandas as pd
from dotenv import find_dotenv, load_dotenv

# Locate the project's .env file, load its variables into the process
# environment, and make the directory containing it the working directory.
ENV_PATH = find_dotenv()
if not ENV_PATH:
    # find_dotenv() returns an empty string when nothing is found; without
    # this guard os.chdir('') fails with an unhelpful "No such file or
    # directory: ''" error.
    raise FileNotFoundError(
        'No .env file found; create one that defines DB_URI before running this notebook.'
    )
load_dotenv(ENV_PATH)
os.chdir(os.path.dirname(ENV_PATH))

In [2]:
# Build the engine from the DB_URI environment variable and keep a single
# connection open; later cells reuse `conn` and the final cell closes it.
eng = sa.create_engine(os.environ['DB_URI'])
conn = eng.connect()

# List the schemas visible through this engine.
schemas = sa.inspect(eng).get_schema_names()
print('DB Schemas: ', schemas)

# Reflect the tables of the 'main' schema. The bind is passed to reflect()
# instead of the MetaData constructor: "bound metadata" was deprecated in
# SQLAlchemy 1.4 and removed in 2.0, while reflect(bind=...) works on both.
meta = sa.MetaData(schema='main')
meta.reflect(bind=conn)
# Iterating meta.tables yields the schema-qualified table names (its keys).
print('Schema Tables: ', list(meta.tables))

DB Schemas:  ['main']
Schema Tables:  ['main.train']


In [3]:
# Load the whole main.train table, using its `id` column as the DataFrame index.
mydat = pd.read_sql_table(table_name='train', con=conn, schema='main', index_col='id')
# Preview a few random rows; the fixed seed keeps the preview identical on
# every Restart & Run All.
mydat.sample(7, random_state=42)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3046,0,"'acche din' for flyers: cap on airfares, lower..."
26051,0,#day in #afghanistan may we have peace in th...
27146,0,a girl is always #pretty #shine #noho
30769,0,the title of my autobiography.
12990,0,#mango #naturalfabrics in highly versatile ea...
19963,0,#fathersdayuk for #sunday :) #dad skull lea...
17006,0,listening to #country #songs after a long we...


In [4]:
# Preview a few rows with label == 1 (the rows counted as hate tweets in the
# class-balance cell); the fixed seed makes the preview reproducible.
mydat.query('label == 1').sample(5, random_state=42)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
29671,1,black men and black women with stockholm syndr...
18838,1,@user president jimmy caer leaving his religio...
20984,1,#scrutinize #savor &amp; #dispose #unbelievabl...
28991,1,attack her chinguas~ &lt;33
3921,1,#mcconnell obstruction goes all the way back t...


In [5]:
# Class balance: summing `label` counts the positive (hate) rows
# (assumes label is coded 0/1, as the sampled rows above suggest).
n_hate = mydat['label'].sum()
n = len(mydat)
print('# of hate tweets:     ', n_hate)
print('# of non-hate tweets: ', n - n_hate)
# Scale to a percentage first and round last: multiplying an already-rounded
# float by 100 can reintroduce binary noise (e.g. 7.0200000000000005).
print('% hate:               ', round(100 * n_hate / n, 2), '%')

# of hate tweets:      2242
# of non-hate tweets:  29720
% hate:                7.01 %


### Conclusion
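There is a very significant class imbalance (only about 7% of the tweets are labeled as hate), and the sample size is too small for me to be comfortable resampling. Since we have yet to split the data for training, validation, and testing, we may end up with only around 1,500 or fewer positive cases in the training data, and I would not want to lose too much of the nuance in the tweets by resampling. Because accuracy would be misleading under this imbalance (always predicting the majority class would already score about 93%), we'll use the F1 score as the performance metric.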

In [6]:
# Release database resources: close() only hands the connection back to the
# engine's pool, so also dispose of the engine to close the pooled DBAPI
# connections it still holds.
conn.close()
eng.dispose()