# Summary
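- Use this notebook to clean a tweet CSV of known problems (missing values, duplicate rows, repeated double quotes, emojis, and rows matched only by username).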

In [1]:
import pandas as pd
import re

# Load the raw tweet export, splitting records on '\n'.
# TODO: consider index_col=0 and explicit dtypes on import.
df = pd.read_csv(
    "tweets_2018.csv",
    lineterminator="\n",
)
df.shape

(70159, 12)

### Cleaning

In [2]:
# Discard rows containing any missing value, then collapse exact duplicate rows.
df = df.dropna().drop_duplicates()

In [3]:
# Remove every run of two or more consecutive double-quote characters from the tweet text.
df["tweet"] = df["tweet"].str.replace(pat=r'"{2,}', repl="", regex=True)

In [4]:
# Compiled character class matching runs of code points in four emoji ranges.
emoji_pattern = re.compile(
    "["
    "\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

# Replace emoji runs with an empty string in the 'tweet' column.
# regex=True is passed explicitly: pandas >= 2.0 defaults str.replace to regex=False,
# which raises ValueError when the pattern is a compiled regex.
df['tweet'] = df['tweet'].str.replace(emoji_pattern, '', regex=True)

**TODO:** In a future iteration, add a regex to remove http / outbound links, as these did not prove beneficial to my close analysis after topic modeling.

In [5]:
df.shape

(70157, 12)

### Bad match row removal

In [6]:
# Keywords used to decide whether a row was matched by its username rather than its text.
keywords = ["nuclear","nuclear weapons","bomber","missile","icbm","MMIII","B-52",
            "B-2","airmen","airman","defender","maintenance","maintainer","payload",
            "transporter","erector","air force","housing","security","safety","nuclear security",
            "space force","suicide","harassment","leader","leadership","training","officer"]

# Build one case-insensitive alternation from the keywords. Each keyword is escaped so it is
# always treated as literal text, even if an entry containing regex metacharacters is added later.
# Matching is substring-based: there are no word boundaries in the pattern.
pattern = re.compile("|".join(re.escape(keyword) for keyword in keywords), re.IGNORECASE)

# Flag "bad match" rows: a keyword appears in 'username' but nowhere in 'tweet'.
# na=False makes missing values count as "no match" instead of propagating NaN into the
# boolean mask, which would break the ~ and & operators.
username_hit = df['username'].str.contains(pattern, na=False)
tweet_hit = df['tweet'].str.contains(pattern, na=False)
mask = username_hit & ~tweet_hit

# Drop the flagged rows; every row that is NOT a bad match is kept.
df = df[~mask]

In [7]:
df.shape

(67631, 12)

### Export

In [8]:
# export cleaned data (currently disabled — nothing is written when this cell runs)
# NOTE(review): the line below references `df_covid`, which is not defined in any cell shown
# in this notebook; the cleaned frame built above is named `df`. Confirm the name before re-enabling.
# df_covid.to_csv("./cleaned_data/tweets_2018_cleaned.csv", header=True, index=True)