In [1]:
import numpy as np
import pandas as pd
import os
import shutil
import json
import gensim
from gensim import corpora, models
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
def load_subreddits(file_name, encoding='utf-8'):
    ''' Load "<subreddit> X <text>" lines from a text file into a DataFrame.

    Each line is split on the FIRST occurrence of " X "; whatever precedes it
    becomes the subreddit and whatever follows becomes the text (so the text
    itself may contain further " X " sequences). Lines without the separator
    are reported on stdout and skipped.

    Parameters
    ----------
    file_name : path of the text file to read.
    encoding : text encoding used to decode the file. Defaults to UTF-8 so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    pandas.DataFrame with string columns 'subreddit' and 'text', one row per
    well-formed line, in file order. The trailing line terminator is not part
    of 'text'.
    '''
    subreddits = []
    texts = []

    with open(file_name, 'r', encoding=encoding) as file:
        for line in file:
            # Drop the line terminator so it does not leak into the text column
            # (or into the "Skipping" message below).
            line = line.rstrip('\n')

            # partition() splits on the first separator only; an empty `sep`
            # means the separator was not found at all.
            subreddit, sep, text = line.partition(" X ")
            if not sep:
                print(f"Skipping line: {line}")
                continue

            subreddits.append(subreddit)
            texts.append(text)

    return pd.DataFrame({
        'subreddit': subreddits,
        'text': texts
    })

In [3]:
# Shell escape: download and install spaCy's `en_core_web_sm` package so that
# spacy.load('en_core_web_sm') can find it.
# NOTE(review): this runs whichever `python` is first on PATH, which is
# presumably the kernel's interpreter — confirm if the load step fails.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 34.4 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
# spaCy pipeline used as the lemmatizer. The 'parser' and 'ner' components are
# disabled at load time; downstream code only reads token.lemma_ / token.pos_.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [14]:
# Make sure NLTK's 'stopwords' corpus is available locally before it is read.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\liuq34\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# stopwords: NLTK's English list extended with the extra entries in en.txt
# (each non-blank line of that file is treated as one stop word).
stop_words = set(stopwords.words('english'))
# Explicit encoding so the file decodes the same way regardless of the
# platform's locale default.
with open('../../data/en.txt', 'r', encoding='utf-8') as file:
    # Strip surrounding whitespace and skip blank lines so the empty string
    # never ends up in the set.
    stop_words.update(word for word in (line.strip() for line in file) if word)

In [17]:
# lowercase, de-accent, tokenize
def sent_to_words(sentences):
    '''
    lazily turn each item of `sentences` into a list of tokens

    Every item is coerced to str and handed to gensim's simple_preprocess with
    deacc=True (accent marks are stripped from the tokens). One token list is
    yielded per input item, in input order.
    '''
    for raw_sentence in sentences:
        token_list = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield token_list

# drop stop words
def remove_stopwords(sentences):
    '''
    filter every token list in `sentences` against the module-level `stop_words`

    Returns a new list with one (possibly empty) token list per input
    sentence; token order within each sentence is preserved.
    '''
    filtered_sentences = []
    for tokens in sentences:
        kept = [token for token in tokens if token not in stop_words]
        filtered_sentences.append(kept)
    return filtered_sentences

# lemmatize
def lemmatize(sentences, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    '''
    lemmatize token lists with the module-level spaCy pipeline `nlp`

    Each token list is joined with single spaces and run through `nlp`; only
    tokens whose coarse part-of-speech tag (token.pos_) is in
    `allowed_postags` are kept, and each kept token is replaced by its lemma
    (token.lemma_).

    sentences: iterable of token lists (one list per document).
    allowed_postags: collection of POS tags to keep. The default is an
        immutable tuple so it cannot be modified between calls.

    Returns a list with one list of lemmas per input document, in input order.
    '''
    # Re-join tokens lazily; nlp.pipe consumes the stream in batches, which
    # avoids the per-call overhead of invoking nlp() once per document.
    joined_docs = (" ".join(sent) for sent in sentences)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

def clean_text(df):
    '''
    run the full cleaning pipeline on the `text` column of `df`

    Steps, in order: tokenize (sent_to_words), drop stop words
    (remove_stopwords), lemmatize (lemmatize). Returns a list of lists of
    tokens, one inner list per row of `df`.
    '''
    raw_documents = df.text.to_list()
    tokens = list(sent_to_words(raw_documents))
    tokens_without_stopwords = remove_stopwords(tokens)
    return lemmatize(tokens_without_stopwords)

In [18]:
def add_ngrams(text_list):
    '''
    add bigrams and trigrams to processed tokens

    text_list: documents as lists of tokens (e.g. the output of clean_text).

    Two gensim Phrases models are trained on the documents passed in: one on
    the raw token lists (bigrams), one on the bigram-transformed lists
    (trigrams). Both are then applied to every document.

    Returns a list with one token list per input document, in input order.
    '''
    # Materialize once: the documents are iterated several times below, which
    # would exhaust a generator argument.
    documents = list(text_list)

    # Train on the argument itself, not on a notebook-level global.
    bigram = gensim.models.Phrases(documents, min_count=5, threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[documents], threshold=100)

    # Phraser wrappers around the trained models, used to apply them.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    return [trigram_mod[bigram_mod[doc]] for doc in documents]

In [None]:
def save_processed_text(text_list, file_path='../../data/train_clean.txt'):
    '''
    save processed documents to txt file

    text_list: iterable of documents, each an iterable of string tokens.
    file_path: destination file; defaults to the training-set location used by
        this notebook. An existing file is overwritten.

    Each document is written as one line: its tokens joined by single spaces,
    followed by a newline (an empty document yields an empty line). The file
    is encoded as UTF-8.
    '''
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(' '.join(doc) + '\n' for doc in text_list)