# Data Science for Social Justice Workshop: Preprocessing – PROJECT

## Reading the Data

Put your data in the `data` folder of this repo and replace `YOUR_FILE.csv` below with the name of your file.

In [None]:
# Import the pandas package
import pandas as pd 

# Read the csv file into a DataFrame (relative path: two directories up, then `data/`)
df = pd.read_csv('../../data/YOUR_FILE.csv')

Check out the shape, first rows, and columns.

In [None]:
# Display the shape of the DataFrame
df.shape

In [None]:
# Display the first rows of the DataFrame
df.head()

In [None]:
# Quick overview of the column labels available in the DataFrame
list(df.columns)

## Removing columns and rows

In [None]:
# Drop a handful of columns by name
unwanted_columns = ['self', 'url', 'subreddit', 'augmented_at', 'augmented_count']
df = df.drop(columns=unwanted_columns)

In [None]:
# Keep only the rows whose selftext is neither '[removed]' nor '[deleted]'
placeholder_mask = df['selftext'].isin(['[removed]', '[deleted]'])
df = df.loc[~placeholder_mask, :]
df.shape

In [None]:
# Keep only the rows that actually have a selftext value
has_selftext = df['selftext'].notna()
df = df[has_selftext]
df.shape

## Preprocessing Data with spaCy

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
from gensim.models.phrases import Phrases, Phraser

In [None]:
def clean(token):
    """Tell whether a token should be discarded during preprocessing.

    A token is discarded when it is flagged as any of:
        - punctuation (`is_punct`)
        - whitespace (`is_space`)
        - a digit (`is_digit`)
    """
    if token.is_punct:
        return True
    if token.is_space:
        return True
    return token.is_digit

def line_read(df, text_col='selftext'):
    """Generator function to read in text from df with line breaks flattened.

    Parameters
    ----------
    df : object indexable by `text_col` (e.g. a DataFrame)
        `df[text_col]` must be an iterable of strings.
    text_col : str
        Name of the column holding the text.

    Yields
    ------
    str
        One string per entry of `df[text_col]`, in order, with every newline
        replaced by a single space.
    """
    for text in df[text_col]:
        # Replace with a space rather than nothing: deleting the newline
        # would glue the last word of one line to the first word of the next.
        yield text.replace('\n', ' ')

def preprocess(df, text_col='selftext', allowed_postags=('NOUN', 'ADJ')):
    """Preprocessing function to apply to a dataframe.

    Parameters
    ----------
    df : DataFrame
        Data containing the text to process.
    text_col : str
        Name of the column holding the text.
    allowed_postags : collection of str
        Coarse part-of-speech tags (compared against `token.pos_`) to keep;
        tokens with any other tag are dropped.

    Yields
    ------
    list of str
        One list per document, in the order of `df[text_col]`, holding the
        lowercased lemmas that survive all filters. The list may be empty.
    """
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    unwanted_lemmas = {"'s", "’s", "’"}
    # Only "ner" is disabled. NOTE(review): in spaCy v3 pipelines the tagger
    # reads the shared "tok2vec" output, so disabling "tok2vec" would leave
    # `token.pos_` unreliable -- confirm against the installed model.
    for parsed in nlp.pipe(line_read(df, text_col), batch_size=1000, disable=["ner"]):
        tokens = []
        for token in parsed:
            # Skip punctuation/space/digits and any word whose part of speech
            # is not allowed. The tag lives on the token, not on the lemma
            # string, so the check has to happen here.
            if clean(token) or token.pos_ not in allowed_postags:
                continue
            # Lowercased lemma; fall back to the lowercased surface form for
            # the '-PRON-' placeholder lemma.
            lemma = token.lemma_.lower() if token.lemma_ != '-PRON-' else token.lower_
            # Remove specific lemmatizations and stop words
            if lemma in unwanted_lemmas or lemma in stop_words:
                continue
            tokens.append(lemma)
        yield tokens

In [None]:
# This may take a while: the generator is consumed in full and stored as a list
lemmas = list(preprocess(df))

## Phrase modeling

In [None]:
from gensim.models.phrases import Phrases, Phraser

# Create bigram and trigram models.
# `min_count` and `threshold` set how sensitive phrase detection is; adjust
# them if the phrases inspected further down do not look appropriate.
bigram = Phrases(lemmas, min_count=10, threshold=100)
# The trigram model is trained on the corpus after the bigram model has been applied to it
trigram = Phrases(bigram[lemmas], min_count=10, threshold=50)  
bigram_phraser = Phraser(bigram)
trigram_phraser = Phraser(trigram)

# Form trigrams: pass each document through the bigram phraser, then the trigram phraser
trigrams = [trigram_phraser[bigram_phraser[doc]] for doc in lemmas]

In [None]:
# Collapse every document's token list into one space-separated string
trigrams_joined = [' '.join(doc_tokens) for doc_tokens in trigrams]
trigrams_joined[0]

Check how many bigrams were identified by the phraser.

In [None]:
# Number of entries stored in the bigram phraser
len(bigram_phraser.phrasegrams)

Print the first few bigrams identified by the model to check whether they seem appropriate. If not, you can play around with the parameters of the bigram model to adjust its sensitivity (the values for `min_count` and `threshold` above).

In [None]:
# First ten entries stored in the bigram phraser
list(bigram_phraser.phrasegrams)[:10]

In [None]:
# Look at trigrams: the entries for which count('_') is exactly 2
[phrase for phrase in trigram_phraser.phrasegrams if phrase.count('_') == 2]

## Saving data

Add the new preprocessed data to our DataFrame in a new column, so that it is included when the .csv is saved below.

In [None]:
# Inserting next to selftext column: the position is looked up rather than
# hard-coded, so it stays right whatever columns the file happens to have
selftext_position = df.columns.get_loc('selftext')
df.insert(loc=selftext_position + 1, column='lemmas', value=trigrams_joined)
# Removing empty rows in lemmas
df = df[df['lemmas'] != '']

Change `YOUR_FILE` below to the name of your dataset.

In [None]:
# Save to new csv (relative path); index=False is passed so the DataFrame index is not written
df.to_csv('YOUR_FILE_lemmas.csv', index=False)