In [16]:
import pandas as pd
import mysql.connector
import os
from datetime import datetime
from tqdm import tqdm
import configparser
import json

In [2]:
import warnings

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning,
# FutureWarning) raised by later cells — consider narrowing it to the
# specific category that motivated it.
warnings.filterwarnings('ignore') 

In [3]:
def open_connection():
    """Open a MySQL connection described by the ``[mysql]`` section of ``config.ini``.

    The file name is relative, so it is looked up in the current working
    directory. The section must provide ``host``, ``user``, ``password`` and
    ``database``.

    Returns:
        The connection object returned by ``mysql.connector.connect``.

    Raises:
        KeyError: if the ``[mysql]`` section or one of its keys is missing
            (``ConfigParser.read`` silently skips a file it cannot find, so
            a missing ``config.ini`` also surfaces here).
    """
    parser = configparser.ConfigParser()
    parser.read('config.ini')

    # Pull the four connection settings out of the section, in a fixed order.
    mysql_settings = parser['mysql']
    connect_kwargs = {
        key: mysql_settings[key]
        for key in ('host', 'user', 'password', 'database')
    }

    return mysql.connector.connect(**connect_kwargs)

In [4]:
def check_folder(name_folder):
    """Return the path of ``name_folder`` under the current working directory.

    The folder (including any missing parents) is created when it does not
    exist yet, and a message is printed in that case.

    Args:
        name_folder: Folder path relative to the current working directory;
            may contain nested components such as ``'Downloads/202201'``.

    Returns:
        str: ``<cwd>/<name_folder>`` after ``os.path.expanduser``.
    """
    # os.getcwd() replaces the IPython-only `%pwd` magic: the function is now
    # plain Python, so it also runs outside a notebook kernel and can be
    # moved into a module unchanged.
    path = os.path.expanduser(f'{os.getcwd()}/{name_folder}')
    if not os.path.exists(path):
        # exist_ok=True tolerates the folder being created by someone else
        # between the existence check and this call.
        os.makedirs(path, exist_ok=True)
        print("{} created.".format(path))

    return path

In [5]:
def humanize(size_bytes):
    """Format a byte count as a string using binary units.

    Values below 1024 are rendered verbatim with a ``B`` suffix; larger
    values are scaled to KiB, MiB or GiB (the largest unit used) and shown
    with two decimals.

    Args:
        size_bytes: Number of bytes.

    Returns:
        str: e.g. ``'512 B'``, ``'1.50 KiB'``, ``'2.00 GiB'``.
    """
    if size_bytes < (1 << 10):
        return f'{size_bytes} B'

    # (power of two of the unit, suffix); a unit applies while the value is
    # still below the next unit, i.e. below 2 ** (power + 10).
    for power, suffix in ((10, 'KiB'), (20, 'MiB')):
        if size_bytes < (1 << (power + 10)):
            return '{:.2f} {}'.format(size_bytes / (1 << power), suffix)

    return '{:.2f} GiB'.format(size_bytes / (1 << 30))

## Export Control

### Get Info from Control CSV

In [18]:
# Load the control file for 2022-01 and print download/extraction totals.
download_folder = check_folder('Downloads/202201')
control_csv = f'{download_folder}/control.csv'

df = pd.read_csv(control_csv)
df = df.fillna(value=0)

# Split the control rows by name prefix.
# NOTE(review): rows starting with 'twitter' are presumably one downloaded
# archive per day (their count is printed as "Total of days") — confirm.
is_download = df['name'].str.startswith('twitter')
down_df = df[is_download]
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `df` (chained assignment / SettingWithCopyWarning).
ext_df = df[~is_download].copy()

# The first 8 characters of `name` are parsed as a YYYYMMDD date. Parsing is
# vectorized and no longer relies on the `datetime` name, which another cell
# of this notebook rebinds.
ext_df['data'] = pd.to_datetime(ext_df['name'].str[:8], format='%Y%m%d')
ext_df = ext_df.astype({'size': float, 'count_total': float, 'count_filtered': float})

# Monthly downloaded size. Variable name kept as-is (sic) in case later cells
# reference it.
# NOTE(review): recent pandas versions deprecate the 'M' alias in favour of
# 'ME' — confirm against the pinned pandas version before changing it.
donwloaded = ext_df.groupby(pd.Grouper(key='data', freq='M'))['size'].sum()

n_days = len(down_df)
total_size = donwloaded.sum()
# Totals and averages are both taken from ext_df so they always agree.
total_tweets = ext_df['count_total'].sum()
total_filtered = ext_df['count_filtered'].sum()

print(f"Total of days: {n_days} \n")

print(f"Total Downloaded: {humanize(total_size)}")
print(f"An average of {humanize(total_size/n_days)} per day downloaded \n")

# `,.0f` drops the stray ".0" the float sums used to print.
print(f"Total of Tweets: {total_tweets:,.0f}")
print(f"An average of {total_tweets/n_days:,.0f} tweets downloaded per day \n")

print(f"Tweets filtered: {total_filtered:,.0f}")
print(f"An average of {total_filtered/n_days:,.0f} tweets filtered per day \n")

df.sample(1)

Total of days: 31 

Total Downloaded: 77.73 GiB
An average of 2.51 GiB per day downloaded 

Total of Tweets: 120,699,166.0
An average of 3,893,521 tweets downloaded per day 

Tweets filtered: 127,261.0
An average of 4,105 tweets filtered per day 



Unnamed: 0,name,datetime,type,size,count_total,count_filtered
23016,20220117/20220117221500.json.gz,2023-05-05 22:30:58,extraction,1642654,2473.0,1.0


### Insert into MySQL

In [19]:
# Insert every row of the control DataFrame (`df`) into the `control` table.
# The statement is parameterized and built once, outside the loop.
control_columns = ['name', 'datetime', 'type', 'size', 'count_total', 'count_filtered']
control_insert_sql = (
    "INSERT INTO control (name, datetime, type, size, count_total, count_filtered) "
    "VALUES (%s, %s, %s, %s, %s, %s)"
)

conn = open_connection()
try:
    cursor = conn.cursor()
    try:
        # itertuples yields plain tuples in `control_columns` order. Unlike the
        # previous per-column unpacking, no loop variable named `datetime` is
        # created, so the `datetime` class imported at the top of the notebook
        # is no longer overwritten when this cell runs.
        records = df[control_columns].itertuples(index=False, name=None)
        for record in tqdm(records, total=len(df)):
            cursor.execute(control_insert_sql, record)

        # Single commit: either every row is stored or, on error, none is.
        conn.commit()
    finally:
        cursor.close()
finally:
    # Always release the connection, even when an insert fails.
    conn.close()


100%|███████████████████████████████████| 44368/44368 [00:22<00:00, 1940.35it/s]


## Export Tweets

In [20]:
# Locate the 2022-01 download folder and the tweets CSV stored inside it.
download_folder = check_folder('Downloads/202201')
control_csv = f'{download_folder}/202201_tweets.csv'

# Read the CSV and replace every missing value with 0 in one chained step.
df = pd.read_csv(control_csv).fillna(value=0)

print(f"Total of Tweets: {len(df):,} \n")

# Display one random row as a quick look at the loaded columns.
df.sample(1)

Total of Tweets: 127,261 



Unnamed: 0,created_at,text,entities
53461,2022-01-13 22:00:03+00:00,RT @ToniaBuxton: How much more of this needs t...,"{""hashtags"": [], ""urls"": [], ""user_mentions"": ..."


### Insert into MySQL

In [21]:
def _entities_as_json(value):
    """Return ``value`` as JSON text without encoding it twice.

    A string that already parses as JSON is returned unchanged; anything
    else (including the 0 used to fill missing values) is serialized with
    ``json.dumps`` as before.
    """
    if isinstance(value, str):
        try:
            json.loads(value)
        except ValueError:
            # Not JSON text: keep the previous behavior and encode the string.
            return json.dumps(value)
        return value
    return json.dumps(value)


# Insert every row of the tweets DataFrame (`df`) into the `tweets` table.
# The statement is parameterized and built once, outside the loop.
tweet_columns = ['created_at', 'text', 'entities']
tweet_insert_sql = "INSERT INTO tweets (created_at, text, entities) VALUES (%s, %s, %s)"

conn = open_connection()
try:
    cursor = conn.cursor()
    try:
        records = df[tweet_columns].itertuples(index=False, name=None)
        for created_at, text, entities in tqdm(records, total=len(df)):
            # The `entities` column read from the CSV already holds JSON text,
            # so calling json.dumps on it again would store an escaped JSON
            # *string* instead of the JSON object.
            # NOTE(review): confirm no reader relies on the double-encoded form.
            cursor.execute(tweet_insert_sql, (created_at, text, _entities_as_json(entities)))

        # Single commit: either every row is stored or, on error, none is.
        conn.commit()
    finally:
        cursor.close()
finally:
    # Always release the connection, even when an insert fails.
    conn.close()


100%|█████████████████████████████████| 127261/127261 [01:18<00:00, 1627.52it/s]
