# Overview

We'll merge all extra data sets into one CSV.

# Instantiate required Python components.

The wider project uses TensorFlow for developing its model. This notebook only needs pandas and a few other Python libraries to work with our CSV files.

In [1]:
import re
import pandas as pd
import csv
import numpy as np

# Used for Troubleshooting
from IPython.display import display

# Set Hyperparameters

List the files to merge into one CSV. Ensure that each file's columns are in the following order: `reason`, `singleMessage`.

In [2]:
# Input CSVs to merge, listed in the order they are read and concatenated.
FILE_MESSAGES = [
    "./data/output/1-preprocessed-data.csv",
    "./data/output/1-preprocess-clean.csv",
    "./data/output/2-dataset-politics.csv",
]

# Read the CSV Data

In [3]:
# Canonical column names, in the order the merged CSV must use.
correct_columns = ['reason', 'singleMessage']

# Collects every input frame that passes the column check below.
valid_dataframes = []

# Read each CSV file and check if it contains the correct columns
for file in FILE_MESSAGES:
    df = pd.read_csv(file)

    # Set comparison: accepts the expected columns in any on-disk order and
    # rejects files that have missing or extra columns.
    if set(df.columns) == set(correct_columns):
        # Select the columns in canonical order. pd.concat takes its column
        # order from the first frame, so without this a file stored as
        # (singleMessage, reason) would flip the layout of the merged output.
        valid_dataframes.append(df[correct_columns])
        print(f"{file} contains the correct columns.")
    else:
        print(f"{file} does not contain the correct columns.")

# Concatenate all valid DataFrames into a single DataFrame with a fresh
# 0..n-1 index (the per-file indexes are discarded).
if valid_dataframes:
    combined_df = pd.concat(valid_dataframes, ignore_index=True)
    print("\nCombined DataFrame:")
    print(combined_df)
else:
    # NOTE: combined_df is not assigned on this path, so the later cells that
    # reference it will raise NameError on a fresh kernel.
    print("\nNo valid DataFrames found.")

./data/output/1-preprocessed-data.csv contains the correct columns.
./data/output/1-preprocess-clean.csv contains the correct columns.
./data/output/2-dataset-politics.csv contains the correct columns.

Combined DataFrame:
                                                   reason  \
0       Account number visible. Please remove from con...   
1                                  Inappropriate comment.   
2                                  Caps for tickers only.   
3                                  Caps for tickers only.   
4                                  Inappropriate comment.   
...                                                   ...   
131269  Politics not allowed outside of references to ...   
131270  Politics not allowed outside of references to ...   
131271  Politics not allowed outside of references to ...   
131272  Politics not allowed outside of references to ...   
131273  Politics not allowed outside of references to ...   

                                            

In [4]:
# Troubleshooting aid: render the merged frame with the notebook's rich display.
display(combined_df)

Unnamed: 0,reason,singleMessage
0,Account number visible. Please remove from con...,a.b.c.warriortrading.com
1,Inappropriate comment.,mammkd. sdkkf
2,Caps for tickers only.,wattior
3,Caps for tickers only.,wattior
4,Inappropriate comment.,wattior
...,...,...
131269,Politics not allowed outside of references to ...,lil'wayne got a pardon and not them ah ah
131270,Politics not allowed outside of references to ...,So you think it will be called unconstitutiona...
131271,Politics not allowed outside of references to ...,The left of America has out numbered the right...
131272,Politics not allowed outside of references to ...,Everyone spread the word…I just set fire on water


# Shuffle DataFrame

In [5]:
# Draw all rows (frac=1) in random order; the fixed random_state makes the
# permutation repeatable, and the index is rebuilt as 0..n-1 afterwards.
shuffled_df = (
    combined_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# 🚧 Save Data to Disk

Let's save all our hard work formatting the dataframe to a CSV for future reference.

- [Pandas DataFrame.to_csv](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html)

In [6]:
# Write the shuffled frame to disk; index=False keeps the row index out of
# the CSV so only the data columns are saved.
shuffled_df.to_csv(
    'data/output/3-merge-data.csv',
    index=False,
)