In [None]:
import nltk
import os
import pandas as pd
import re
import spacy

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pyprojroot import here
from string import punctuation

In [None]:
# Build the path to the novel relative to the project root
text_path = here('data/sowing_and_reaping.txt')

# Read the whole file. The encoding is pinned so that decoding, and therefore
# the character offsets used in the slice below, does not depend on the
# platform's default codec.
# NOTE(review): assumes the file is UTF-8 encoded — confirm
with open(text_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Remove the front and end matter by slicing at fixed character offsets
sowing_and_reaping = raw_text[1114:684814]

In [None]:
# pandas supplies the CSV reader used for the tweets
import pandas as pd
# Locate the airline tweets file relative to the project root,
# then load it into a DataFrame (comma is the reader's default separator)
csv_path = here('data/airline_tweets.csv')
tweets = pd.read_csv(csv_path)

In [None]:
# Path to the short example text, relative to the project root
example2_path = here('data/example2.txt')

# Pin the encoding so the text decodes the same way on every platform
# NOTE(review): assumes the file is UTF-8 encoded — confirm
with open(example2_path, encoding='utf-8') as file:
    example2 = file.read()

## Challenge 1: Working with `string`s

* What type of object is `sowing_and_reaping`?
* How many characters are in `sowing_and_reaping`?
* How can we get the first 1000 characters of `sowing_and_reaping`?

In [None]:
# Show what type of object `sowing_and_reaping` is
type(sowing_and_reaping)

In [None]:
# Count the characters in the text
len(sowing_and_reaping)

In [None]:
# Slice off the first 1000 characters
sowing_and_reaping[:1000]

## Challenge 2: Reading in Many Files

The `data` folder contains another folder called `amazon`, which contains many `csv` files of Amazon reviews. Use a `for` loop to read in each dataframe. Do the following:

* We've provided a path to the `amazon` folder, and a list of all the file names within the folder using the `os.listdir()` function.
* Iterate over all these files, and import them using `pd.read_csv()`. You will need to use `os.path.join()` to create the correct path. Additionally, you need to provide `pandas` with the column names since they are not included in the reviews. We have created the `column_names` variable for you.
* Extract the text column from each dataframe, and add them to the `reviews` list.
* How many total reviews do you obtain?

In [None]:
# os supplies the directory-listing and path-joining helpers
import os
# Location of the folder holding the Amazon review files
amazon_path = here('data/amazon')
# Names of every entry inside that folder
files = os.listdir(amazon_path)
# Column names to hand to pandas when reading each file
column_names = [
    'id', 'product_id', 'user_id', 'profile_name',
    'helpfulness_num', 'helpfulness_denom',
    'score', 'time', 'summary', 'text',
]
# Accumulator for the text of every review
reviews = []

In [None]:
# Start from an empty list so that re-running this cell does not append the
# same reviews a second time
reviews = []

# os.listdir() returns entries in arbitrary order and can include stray
# non-CSV entries, so keep only the CSV files and visit them in sorted order
for file in sorted(files):
    if not file.lower().endswith('.csv'):
        continue
    full_path = os.path.join(amazon_path, file)
    # Column names are passed explicitly via `names`
    reviews_df = pd.read_csv(full_path, sep=',', names=column_names)
    # Collect the text of every review in this file
    text = list(reviews_df['text'])
    reviews.extend(text)

# Total number of reviews collected across all files
len(reviews)

## Challenge 3: Text Cleaning with Multiple Steps

In Challenge 2, we imported many Amazon reviews, and stored them in a variable called `reviews`. Each element of the list is a string, representing the text of a single review. For each review:

* Strip all blank space
* Make all characters lower case
* Replace any URLs and digits

In [None]:
def preprocess(text):
    """Lowercase a string and normalize its URLs, digits, and whitespace.

    Args:
        text (str): The raw text to clean.

    Returns:
        str: The lowercased text in which every URL is replaced by "URL",
        every run of digits by "DIGIT", every run of whitespace by a single
        space, and leading/trailing whitespace is removed.
    """
    # Lowercase first so the upper-case placeholder tokens added below survive
    text = text.lower()
    # Replace URLs. A URL ends at the first whitespace character, so match
    # non-whitespace only; a greedy `.*` would also swallow every word that
    # follows the URL on the same line.
    url_pattern = r'https?://\S+'
    url_repl = ' URL '
    text = re.sub(url_pattern, url_repl, text)
    # Replace digits (raw string, so `\d` is not an invalid string escape)
    digit_pattern = r'\d+'
    digit_repl = ' DIGIT '
    text = re.sub(digit_pattern, digit_repl, text)
    # Collapse every run of whitespace, including newlines, to one space
    blankspace_pattern = r'\s+'
    blankspace_repl = ' '
    text = re.sub(blankspace_pattern, blankspace_repl, text)
    # Last step: strip the padding left by the placeholder replacements
    return text.strip()

In [None]:
# Run the cleaning function over every review, keeping the original order
processed_reviews = list(map(preprocess, reviews))

In [None]:
# Show the first cleaned review as a sanity check
print(processed_reviews[0])

## Challenge 4: Tokenizing a Large Text

Tokenize "Sowing and Reaping", which we imported at the beginning of this workshop. Use a method of your choice.

Once you've tokenized, find all the unique word types (you might want the `set` function). Then, sort the resulting `set` object to create a vocabulary (you might want to use the `sorted` function).

In [None]:
# Tokenize the full text with nltk's word_tokenize
tokens = word_tokenize(sowing_and_reaping)

In [None]:
# Alternative: tokenize with spaCy
# Load the "en_core_web_sm" pipeline and run it over the full text
nlp = spacy.load("en_core_web_sm")
doc = nlp(sowing_and_reaping)
# Keep only the text of each token; this rebinds `tokens`
tokens = [token.text for token in doc]

In [None]:
# Collapse the token list to its distinct entries
unique_tokens = set(tokens)
# Order the distinct tokens to form the vocabulary
sorted_tokens = list(unique_tokens)
sorted_tokens.sort()

In [None]:
# Show the first 100 entries of the sorted vocabulary
print(sorted_tokens[:100])

In [None]:
# Show the last 100 entries of the sorted vocabulary
print(sorted_tokens[-100:])

## Challenge 5: Apply a Lemmatizer to Text

Lemmatize the tokenized `example2` text using `nltk`'s `WordNetLemmatizer`.

In [None]:
# Create the WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

In [None]:
# Tokenize the example2 text with nltk; this rebinds `tokens`
tokens = word_tokenize(example2)

In [None]:
# Lemmatize each token in order; no part-of-speech argument is passed
lemmatized = list(map(lemmatizer.lemmatize, tokens))

In [None]:
# Show the lemmatized tokens
print(lemmatized)

In [None]:
# Show the original text for comparison
print(example2)

## Challenge 6: Putting it All Together

Write a function called `preprocess()` that accepts a string and performs the following preprocessing steps:

* Strip whitespace.
* Lowercase text.
* Tokenize.
* Replace all URLs with the token "URL" and all numbers with the token "DIGIT".
* Remove punctuation.
* Remove stop words.
* Lemmatize the tokens.

Apply this function to `sowing_and_reaping`.

In [None]:
def preprocess(text):
    """Clean a string, then tokenize, filter, and lemmatize it.

    Args:
        text (str): The raw text to preprocess.

    Returns:
        list[str]: The lemmatized tokens that remain after lowercasing,
        replacing URLs with "URL" and digit runs with "DIGIT", collapsing
        whitespace, tokenizing with nltk's word_tokenize, and dropping
        punctuation tokens and English stop words.
    """
    # Lowercase first so the upper-case placeholder tokens added below survive
    text = text.lower()
    # Replace URLs. A URL ends at the first whitespace character, so match
    # non-whitespace only; a greedy `.*` would also swallow every word that
    # follows the URL on the same line.
    url_pattern = r'https?://\S+'
    url_repl = ' URL '
    text = re.sub(url_pattern, url_repl, text)
    # Replace digits (raw string, so `\d` is not an invalid string escape)
    digit_pattern = r'\d+'
    digit_repl = ' DIGIT '
    text = re.sub(digit_pattern, digit_repl, text)
    # Collapse every run of whitespace to one space and trim the ends
    blankspace_pattern = r'\s+'
    blankspace_repl = ' '
    text = re.sub(blankspace_pattern, blankspace_repl, text).strip()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove punctuation. `punctuation` is a string, so `token in punctuation`
    # is a substring test that lets multi-character tokens such as `` or ...
    # slip through; instead drop any token made up only of punctuation.
    tokens = [token for token in tokens
              if not all(char in punctuation for char in token)]
    # Remove stop words; a set gives constant-time membership checks
    stop = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

In [None]:
# Run the full preprocessing pipeline on the novel
processed = preprocess(sowing_and_reaping)