### 6. Text preprocessing

I'm going to use NLTK for the preprocessing of the summaries:

- Lowercasing: Convert all text to lowercase to maintain consistency.
- Tokenization: Split the text into individual words (tokens).
- Removing stop words.
- Lemmatization or stemming: Reduce words to their base or root form to normalize variations.
- Removing special characters and numbers.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

import seaborn as sns
sns.set_context('poster')
sns.set(rc={'figure.figsize': (16., 9.)})
sns.set_style('whitegrid')
import seaborn as sns
import numpy as np
import re
import os

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Load the book-summaries dataset into `five` and preview its first three rows.
five = pd.read_csv(
    filepath_or_buffer="/Users/usuari/Desktop/Ironhack/BOOTCAMP/projects/final_project/data/five.csv"
)
five.head(n=3)

Unnamed: 0,title,summary,genre
0,A Clockwork Orange,"Alex, a teenager living in near-future Englan...",science fiction
1,The Plague,The text of The Plague is divided into five p...,literary fiction
2,All Quiet on the Western Front,"The book tells the story of Paul Bäumer, a Ge...",literary fiction


In [3]:
# Show the dataset dimensions as (rows, columns).
five.shape

(11013, 3)

In [33]:
import nltk 
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models, the stop-word lists, and the WordNet data.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when word_tokenize is called — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocessing_1(five):
    """Add a 'cleaned_summary' column holding a normalized copy of each summary.

    Every value of ``five['summary']`` is lowercased, tokenized with
    ``word_tokenize``, filtered against the English stop-word list,
    lemmatized with ``WordNetLemmatizer``, and stripped of every character
    that is not an ASCII letter. The surviving tokens are joined with single
    spaces.

    Args:
        five: DataFrame with a 'summary' column. It is modified in place.

    Returns:
        None. The result is written to ``five['cleaned_summary']``.
    """
    # These are identical for every row, so build them once rather than
    # once per summary.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    non_letters = re.compile(r'[^a-zA-Z]')

    def clean(text):
        # A missing summary (e.g. NaN) has no text to process.
        if not isinstance(text, str):
            return ''

        tokens = word_tokenize(text.lower())
        lemmas = (
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words
        )
        letters_only = (non_letters.sub('', lemma) for lemma in lemmas)
        # Tokens made only of punctuation or digits are now empty strings;
        # drop them so the output has no runs of consecutive spaces.
        return ' '.join(token for token in letters_only if token)

    # Assign the whole column at once instead of writing cell by cell while
    # iterating over the frame (and re-casting the column on every row).
    five['cleaned_summary'] = [clean(text) for text in five['summary']]

[nltk_data] Downloading package punkt to /Users/usuari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/usuari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/usuari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
# Run the first preprocessing pass; it writes the 'cleaned_summary' column of
# `five` in place.
preprocessing_1(five)

In [35]:
# Preview the first rows to inspect the 'cleaned_summary' column.
five.head()

Unnamed: 0,title,summary,genre,cleaned_summary
0,A Clockwork Orange,"Alex, a teenager living in near-future Englan...",science fiction,alex teenager living nearfuture england lead...
1,The Plague,The text of The Plague is divided into five p...,literary fiction,text plague divided five part town oran thou...
2,All Quiet on the Western Front,"The book tells the story of Paul Bäumer, a Ge...",literary fiction,book tell story paul bumer german soldier who...
3,A Wizard of Earthsea,"Ged is a young boy on Gont, one of the larger...",fantasy,ged young boy gont one larger island north ar...
4,Blade Runner 3: Replicant Night,"Living on Mars, Deckard is acting as a consul...",science fiction,living mar deckard acting consultant movie cr...


PLUS:

1) NER: named entity recognition

2) Add a column with the length of each summary, so that I can then do a groupby and plot a histogram for each genre. 

3) Add a column with the number of unique words in each summary, so that I can do a value_counts to see which genre has the most unique words 
(probably the fantasy genre). 


In [17]:
def preprocessing_2(five):
    """Variant of the summary cleaner that tries to keep abbreviations' case.

    For every row of ``five``, reads ``row['summary']``, conditionally
    lowercases it, tokenizes it, drops English stop words, lemmatizes the
    remaining tokens, strips non-letter characters, and stores the
    space-joined result in ``five['cleaned_summary']`` for that row.

    Args:
        five: DataFrame with a 'summary' column; written to in place.
    """
    for index, row in five.iterrows():
        text = row['summary']
        
        # Lowercasing if it's not an abbreviation.
        # NOTE(review): re.match only tests the START of the whole summary, so
        # this chooses the case of the entire text at once rather than per
        # word: a summary whose first characters match is left fully
        # un-lowercased, and every other summary is fully lowercased.
        # Presumably a per-token check was intended — confirm.
        if re.match('([A-Z]+[a-z]*){2,}', text):
            text = text
        else:
            text = text.lower() 
        
        # Tokenization
        tokens = word_tokenize(text)
        
        # Removing stop words
        # The comparison lowercases each token, so stop words are removed
        # even when the text above kept its original case.
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
        
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
        
        # Removing special characters and numbers
        # Tokens with no ASCII letters become empty strings here and are still
        # joined below, which leaves consecutive spaces in the result.
        clean_tokens = [re.sub(r'[^a-zA-Z]', '', word) for word in lemmatized_tokens]
        
        # Update the 'cleaned_summary' column with the preprocessed text
        five.at[index, 'cleaned_summary'] = ' '.join(clean_tokens)