In [1]:
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# Directory that holds the raw CSV exports, one file per dataset.
folder = "twitter_data"

# Columns every export is expected to contain, in file order.
expected_columns = ['tweetid', 'userid', 'user_display_name', 'user_screen_name',
       'user_reported_location', 'user_profile_description',
       'user_profile_url', 'follower_count', 'following_count',
       'account_creation_date', 'account_language', 'tweet_language',
       'tweet_text', 'tweet_time', 'tweet_client_name', 'in_reply_to_userid',
       'in_reply_to_tweetid', 'quoted_tweet_tweetid', 'is_retweet',
       'retweet_userid', 'retweet_tweetid', 'latitude', 'longitude',
       'quote_count', 'reply_count', 'like_count', 'retweet_count', 'hashtags',
       'urls', 'user_mentions', 'poll_choices']

# pandas dtype for each entry of `expected_columns` (same order, same length);
# the two lists are zipped into the `dtype=` mapping passed to pd.read_csv.
#
# `tweetid` is read as a string, like every other *_tweetid / *_userid column:
# tweet IDs are 64-bit integers, and float64 only represents integers exactly
# up to 2**53, so parsing them as floats silently corrupts the low digits.
# The count columns stay float64 so that missing values can be held as NaN.
dtypes = [
    'str',                                             # tweetid
    'str', 'str', 'str', 'str', 'str', 'str',          # userid .. user_profile_url
    'float64', 'float64',                              # follower_count, following_count
    'str', 'str', 'str', 'str', 'str', 'str',          # account_creation_date .. tweet_client_name
    'str', 'str', 'str', 'str', 'str', 'str',          # in_reply_to_userid .. retweet_tweetid
    'str', 'str',                                      # latitude, longitude
    'float64', 'float64', 'float64', 'float64',        # quote/reply/like/retweet counts
    'str', 'str', 'str', 'str',                        # hashtags, urls, user_mentions, poll_choices
]

# zip() would silently truncate if the two lists ever drifted apart.
assert len(dtypes) == len(expected_columns), "dtypes must have one entry per expected column"


In [None]:

# Sanity-check every export in `folder`: report how long it takes to load,
# how many rows it has, what share of them are English, and whether its
# columns match `expected_columns`.
for file in tqdm(sorted(os.listdir(folder))):  # sorted -> same order on every run
    path = os.path.join(folder, file)
    if not os.path.isfile(path):
        # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints),
        # which read_csv cannot open.
        continue

    tqdm.write("File: {}".format(file))
    start_time = time.time()
    df = pd.read_csv(
        path,
        dtype=dict(zip(expected_columns, dtypes)),
        parse_dates=["account_creation_date", "tweet_time"],
        low_memory=False,
    )
    tqdm.write("Time to read: {}".format(time.time() - start_time))

    total_rows = df.shape[0]
    tqdm.write("Entries: {}".format(total_rows))

    # Count the matches instead of materialising a filtered copy of the frame:
    # a copy would keep the English rows alive even after `del df` below.
    english_rows = int((df["tweet_language"] == "en").sum())
    # A header-only file has zero rows; report 0% rather than dividing by zero.
    english_pct = english_rows / total_rows * 100 if total_rows else 0.0
    tqdm.write("Number of english tweets: {} ({}%)".format(english_rows, english_pct))

    actual_columns = set(df.columns)
    wanted_columns = set(expected_columns)
    extra_columns = actual_columns - wanted_columns
    missing_columns = wanted_columns - actual_columns
    tqdm.write("Columns match: {}".format(not extra_columns and not missing_columns))
    if extra_columns:
        tqdm.write("Extra columns: {}".format(extra_columns))
    if missing_columns:
        tqdm.write("Missing columns: {}".format(missing_columns))

    # Release the frame before the next (possibly very large) file is loaded.
    del df
    tqdm.write("")


In [None]:
# A lot of these datasets are way too big to read into memory in one go, so
# they are streamed row by row with the csv module instead of pandas. Only the
# English tweets are wanted, so every other row is dropped while streaming and
# the matching rows are appended to a single output file.
import csv

folder = "twitter_data"
output_file = "english_tweets.csv"

# List the inputs before touching the output so that a missing input folder
# does not leave an empty output file behind.
input_files = sorted(os.listdir(folder))

# NOTE(review): the output is opened in append mode and no header row is
# written (both unchanged from the previous version of this cell), so running
# the cell twice stores every tweet twice -- delete `output_file` before a re-run.
#
# newline='' is what the csv module requires for files handed to reader/writer;
# without it, newlines embedded in quoted fields (multi-line tweet text) are
# not round-tripped correctly and Windows output gains blank lines.
with open(output_file, 'a', newline='', encoding='utf-8') as out:
    writer = csv.writer(out)

    for file in tqdm(input_files):
        path = os.path.join(folder, file)
        if not os.path.isfile(path):
            # skip sub-directories such as .ipynb_checkpoints
            continue

        tqdm.write("File: {}".format(file))
        # Start the clock for *this* file; previously the elapsed time was
        # measured against a stale `start_time` left over from an earlier cell.
        start_time = time.time()

        with open(path, 'r', newline='', encoding='utf-8') as f:
            reader = csv.reader(f)
            header = next(reader, None)  # None for a completely empty file

            if header is None or "tweet_language" not in header:
                tqdm.write("Skipped: no tweet_language column found")
                tqdm.write("")
                continue

            # find the index of the tweet language column
            tweet_lang_idx = header.index("tweet_language")

            # iterate over the rows, and only write the english ones to the output file
            for row in reader:
                # csv.reader yields [] for blank lines; guard against short rows
                if len(row) > tweet_lang_idx and row[tweet_lang_idx] == "en":
                    writer.writerow(row)

        tqdm.write("Time to read: {}".format(time.time() - start_time))
        tqdm.write("")

In [3]:
# # (disabled) combine all English tweets into a single dataframe; the non-retweet filter below is commented out, so retweets are kept
# all_tweets = pd.DataFrame(columns=expected_columns)
# initial_length = 0
# for file in tqdm(os.listdir(folder)):
#     # df = pd.read_csv(os.path.join(folder, file), low_memory=False)
#     df = pd.read_csv(os.path.join(folder, file), dtype=dict(zip(expected_columns, dtypes)), low_memory=False) # parse_dates=["account_creation_date", "tweet_time"], 
#     initial_length += df.shape[0]
#     eng_tweets = df[df["tweet_language"] == "en"]
#     # non_retweets = eng_tweets[eng_tweets["is_retweet"] == False]
#     # all_tweets = pd.concat([all_tweets, non_retweets])
#     all_tweets = pd.concat([all_tweets, eng_tweets])
#     del df
#     del eng_tweets
#     # del non_retweets

# print("Reduced from {} to {} entries".format(initial_length, all_tweets.shape[0]))
# all_tweets = all_tweets.drop(columns=["tweet_language", "is_retweet"])
# # save to csv
# all_tweets.to_csv("all_tweets.csv", index=False)

 70%|███████   | 19/27 [08:20<03:50, 28.87s/it]