In [None]:
import pandas as pd, pyarrow, ast, emoji, regex, json, csv, os
from emosent import get_emoji_sentiment_rank

# Data Cleaning and Pre-Processing
## Splitting the Data
The raw dataset is too large to work with as a single file, so we'll first split it into more manageable pieces. I've written the function below
to help manage large CSV files. Its default `row_limit` is 10,000 rows per output file; because this is such a large dataset, I've split the data into 50,000 rows per CSV and stored the pieces under `../gda_data/interim/split_csv`.

It should be noted that splitting this large file does take a while.

In [None]:
def split_file(filehandler: object, delimiter: str =',', row_limit: int =10000, output_name_template: str ='output_%s.csv', output_path: str ='.', keep_headers: bool = True):
    """
    Split a large delimited file into numbered pieces of at most ``row_limit`` data rows.

    Parameters
    ----------
    filehandler : an open text file (or any iterable of lines) to read rows from.
    delimiter : field delimiter used for both reading and writing (default comma).
    row_limit : maximum number of data rows per output file (default 10,000).
    output_name_template : ``%``-style template for the piece names; it receives the
        1-based piece number (default ``'output_%s.csv'``).
    output_path : directory the pieces are written to (default current directory).
        It must already exist.
    keep_headers : when True, the first input row is treated as a header and is
        repeated at the top of every piece.

    Raises
    ------
    ValueError : if ``row_limit`` is smaller than 1.

    The first piece is always created, even when the input has no data rows.
    """
    if row_limit < 1:
        raise ValueError("row_limit must be a positive integer")

    reader = csv.reader(filehandler, delimiter=delimiter)
    # next(..., None) keeps a completely empty input from raising StopIteration.
    headers = next(reader, None) if keep_headers else None
    # Write the pieces with the same encoding the input was opened with (when known),
    # so the text round-trips unchanged; None falls back to the platform default.
    out_encoding = getattr(filehandler, 'encoding', None)

    def open_piece(piece_number):
        """Open output piece ``piece_number`` and write the header row if there is one."""
        piece_path = os.path.join(output_path, output_name_template % piece_number)
        # newline='' lets the csv module control line endings (avoids blank rows on Windows).
        piece_file = open(piece_path, 'w', newline='', encoding=out_encoding)
        piece_writer = csv.writer(piece_file, delimiter=delimiter)
        if headers is not None:
            piece_writer.writerow(headers)
        return piece_file, piece_writer

    current_piece = 1
    current_file, current_writer = open_piece(current_piece)
    try:
        for row_number, row in enumerate(reader, start=1):
            # Roll over to a new piece once the current one holds row_limit data rows.
            if row_number > row_limit * current_piece:
                current_file.close()
                current_piece += 1
                current_file, current_writer = open_piece(current_piece)
            current_writer.writerow(row)
    finally:
        # Always release the last handle, even if reading or writing fails part-way.
        current_file.close()

In [None]:
# Split the data set into files of 50,000 rows.
# Create the output directory first so this cell also works on a fresh checkout.
os.makedirs('../gda_data/interim/split_csv', exist_ok=True)
# newline='' is how the csv module expects its input to be opened (it keeps newlines
# embedded in quoted fields intact); the context manager closes the raw file afterwards.
with open('../gda_data/raw/2021-all-Ads-tweets.csv', 'r', newline='') as raw_tweets:
    split_file(raw_tweets, row_limit=50000, output_path='../gda_data/interim/split_csv')

## Removing Unnecessary Data & Changing Data Strings
To improve memory usage and processing time, we'll go ahead and drop the columns we don't need from the CSV files. We'll store the data under a new
directory, `../gda_data/processed/feathers`. We'll also be changing the serialization of the files from `.csv` to `.feather` file types for faster processing.

The `ast.literal_eval` function parses each string back into a Python dictionary, so the value is loaded as a dictionary in the pandas dataframe. We'll do this for both the entities and user columns.

In [None]:
# Columns that are not needed downstream (duplicates removed from the list).
columns_to_drop = [
    'display_text_range', 'geo', 'in_reply_to_status_id', 'scopes', 'possibly_sensitive',
    'quoted_status_id', 'quoted_status_id_str', 'truncated', 'quoted_status_permalink',
    'filter_level', 'quoted_status', 'contributors', 'coordinates', 'extended_tweet',
    'extended_entities', 'matching_rules', 'is_quote_status', 'place', 'Table Name1',
    'F1', 'id_str',
]
# Read the pieces from the directory the splitting cell writes to
# ('../gda_data/interim/split_csv'); the previous '../gda_data/split_csv' did not match it.
split_csv_dir = '../gda_data/interim/split_csv'
feather_dir = '../gda_data/processed/feathers'
os.makedirs(feather_dir, exist_ok=True)


def _parse_literal(value):
    """Parse the string form of a Python literal; missing values (NaN/None) become None."""
    # str(NaN) is 'nan', which ast.literal_eval rejects with a ValueError.
    if pd.isna(value):
        return None
    return ast.literal_eval(str(value))


for file in sorted(os.listdir(split_csv_dir)):
    # Ignore anything that is not one of the split CSV pieces (hidden files, checkpoints, ...).
    if not file.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(split_csv_dir, file))
    df = df.drop(columns=columns_to_drop)
    # These two columns hold dictionaries that were serialized as text in the CSV.
    df['entities'] = df['entities'].apply(_parse_literal)
    df['user'] = df['user'].apply(_parse_literal)
    new_file_name = file.replace('.csv', '.feather')

    df.to_feather(os.path.join(feather_dir, new_file_name))

## Dealing with Unique Data -- EMOJIS 😊
There are many types of media used in social media today. They can communicate in a unique way, beyond what we do with text alone: they convey ideas and feelings in context. We'll go ahead and use the function below to identify emojis and add them to a new column 'emojis' in the dataframe before writing them back to file. We'll also build our emoji_tracker dictionary and write that to a .json file as well.

In [None]:
# Maps each emoji to {'count': int} plus, when available, a 'sentiment' entry.
# It is filled in as a side effect of find_emojis; re-run this cell to reset it.
emoji_tracker = {}
def find_emojis(text: str):
    """
    Return the list of emojis found in ``text`` and record them in ``emoji_tracker``.

    The text is split into grapheme clusters (regex ``\\X``) so multi-codepoint emojis
    stay in one piece. Every cluster containing at least one character known to the
    ``emoji`` package is appended to the returned list (duplicates included, in order
    of appearance). For each such emoji the module-level ``emoji_tracker`` gets its
    'count' incremented; on first sight a 'sentiment' entry from
    ``get_emoji_sentiment_rank`` is stored as well, if that lookup succeeds.

    Non-string input (e.g. NaN/None for a missing tweet text) yields an empty list.
    """
    # pandas hands missing values to .apply() as NaN/None; they contain no emojis.
    if not isinstance(text, str):
        return []

    # Look the table up once per call instead of once per character.
    # NOTE(review): emoji.UNICODE_EMOJI only exists in older releases of the `emoji` package - confirm the pinned version.
    known_emojis = emoji.UNICODE_EMOJI['en']

    record_level_emoji = []
    for grapheme in regex.findall(r'\X', text):
        if not any(char in known_emojis for char in grapheme):
            continue
        record_level_emoji.append(grapheme)

        entry = emoji_tracker.get(grapheme)
        if entry is not None:
            entry['count'] += 1
            continue

        entry = {'count': 1}
        try:
            entry['sentiment'] = get_emoji_sentiment_rank(grapheme)
        except Exception:
            # Best effort: if the sentiment lookup fails, track the count only.
            # (Exception rather than a bare except, so Ctrl-C still interrupts the run.)
            pass
        emoji_tracker[grapheme] = entry

    return record_level_emoji

In [None]:
# NOTE(review): the CSV-to-feather cell writes to '../gda_data/processed/feathers', but this
# cell reads 'data/clean_data' - confirm which directory is the intended one.
clean_data_dir = 'data/clean_data'
for file in sorted(os.listdir(clean_data_dir)):
    # Skip anything that is not a feather file (hidden files, checkpoints, ...);
    # read_feather would fail on those.
    if not file.endswith('.feather'):
        continue
    feather_path = os.path.join(clean_data_dir, file)
    df = pd.read_feather(feather_path)
    # find_emojis also updates the module-level emoji_tracker as a side effect.
    df['emojis'] = df['text'].apply(find_emojis)
    # Overwrite the file in place so the new 'emojis' column is stored with the data.
    df.to_feather(feather_path)

## Store Our Data
Let's go ahead and store our emoji_tracker dictionary as a `.json` file under `data/descriptions`, and also export it as a `.csv` under `exported_data`.

In [None]:
# Persist the emoji tracker (counts and, where available, sentiment) as JSON.
with open('data/descriptions/emojis.json', 'w') as json_file:
    serialized_tracker = json.dumps(emoji_tracker)
    json_file.write(serialized_tracker)

In [None]:
# One row per emoji (the emoji itself is the index), starting with 'count' and 'sentiment'.
emoji_df = pd.DataFrame.from_dict(emoji_tracker, orient='index')
# 'sentiment' only exists if at least one tracked emoji had a sentiment entry.
if 'sentiment' in emoji_df.columns:
    # Expand each sentiment dictionary into its own set of columns.
    sentiment_df = emoji_df['sentiment'].apply(pd.Series)
    # Emojis without a sentiment (NaN) expand into a spurious column labelled 0; it is only
    # present when such emojis exist, so drop it without failing when it is absent.
    sentiment_df = sentiment_df.drop(columns=[0], errors='ignore')
    emoji_df = pd.concat([emoji_df.drop(columns=['sentiment']), sentiment_df], axis=1)
emoji_df.to_csv('exported_data/emojis.csv', index=True)