<a href="https://colab.research.google.com/github/chekhovana/courses/blob/main/ml_stepik/6_final_project/week4_online_app/model/imdb_data_loading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and import libraries

In [None]:
import re
import os
import random
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from urllib import request
import tarfile
import nltk

## Load dataset

In [None]:
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
archive_path = "imdb.tar.gz"
extracted_dir = "aclImdb"  # folder that the loading cells read ('aclImdb/train', 'aclImdb/test')

# Download only when the archive is not already on disk, so that re-running
# the notebook does not fetch the file again. The download goes to a temporary
# name first and is renamed on success: an interrupted transfer therefore never
# leaves a truncated "imdb.tar.gz" that a later run would mistake for a
# complete one.
if not os.path.exists(archive_path):
    partial_path = archive_path + ".part"
    request.urlretrieve(url, partial_path)
    os.replace(partial_path, archive_path)

# Likewise skip extraction when the target folder already exists.
# Delete the folder to force a fresh extraction.
if not os.path.isdir(extracted_dir):
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall()

In [None]:
def load_reviews(folder):
    """Read every file in ``folder`` and return the contents as a list of strings.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, so without sorting the list (and any seeded shuffle
    applied to it afterwards) could differ between machines or runs.

    Files are decoded as UTF-8 explicitly instead of relying on the platform's
    default encoding.

    Args:
        folder: Path of a directory whose entries are all readable text files.

    Returns:
        list[str]: One string per file, in sorted filename order. Empty when
        the directory is empty.
    """
    reviews = []
    for fn in tqdm(sorted(os.listdir(folder))):
        with open(os.path.join(folder, fn), encoding='utf-8') as f:
            reviews.append(f.read())
    return reviews

def load_dataset(folder):
    """Load the positive and negative reviews under ``folder`` and shuffle them.

    Reviews are read from the ``pos`` and ``neg`` subfolders of ``folder`` and
    labelled 1 and 0 respectively.

    Texts and labels are kept in two separate arrays. Packing them into a
    single NumPy array would convert the integer labels to strings and store
    every cell as a fixed-width unicode field padded to the longest review.

    The shuffle uses NumPy's global random state, so it is controlled by
    ``np.random.seed``.

    Args:
        folder: Directory containing the ``pos`` and ``neg`` subfolders.

    Returns:
        tuple: ``(reviews, labels)`` - two 1-D arrays of equal length in the
        same shuffled order. ``reviews`` has dtype ``object`` (Python strings),
        ``labels`` has an integer dtype (1 = positive, 0 = negative).
    """
    texts, labels = [], []
    for class_label, subfolder in [(1, 'pos'), (0, 'neg')]:
        reviews = load_reviews(os.path.join(folder, subfolder))
        texts.extend(reviews)
        labels.extend([class_label] * len(reviews))

    # dtype=object stores references to the strings instead of padded copies.
    text_arr = np.array(texts, dtype=object)
    label_arr = np.array(labels, dtype=np.int64)

    # One permutation applied to both arrays keeps each text with its label.
    order = np.random.permutation(len(text_arr))
    return text_arr[order], label_arr[order]

# Seed NumPy's global RNG once so the shuffle performed in load_dataset is
# repeatable when the notebook is run top to bottom on a fresh kernel.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Preprocess dataset

Data preprocessing includes the following steps, applied in this order:

*   convert to lower case
*   remove HTML tags
*   remove punctuation and digits
*   remove stop words
*   perform lemmatization


In [None]:
%%time
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = nltk.corpus.stopwords.words('english')

def preprocess(x):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text;
      2. replace each HTML tag with a space;
      3. delete every character that is not ``a``-``z`` or whitespace;
      4. drop words found in the module-level ``stop_words`` collection;
      5. lemmatize each remaining word with NLTK's ``WordNetLemmatizer``.

    Args:
        x: Raw review text.

    Returns:
        str: The remaining words joined by single spaces (empty string when
        nothing is left).
    """
    x = x.lower()
    x = re.sub(r'<.*?>', ' ', x)
    # Use a-z, not A-z: the range A-z also covers the ASCII characters that sit
    # between "Z" and "a" ([ \ ] ^ _ `), which would then survive this step.
    # The text is already lower-case, so a-z is sufficient.
    x = re.sub(r'[^a-z\s]', '', x)

    # remove stop words
    words = [w for w in x.split() if w not in stop_words]

    # Stemming (PorterStemmer) was tried and dropped as irrelevant; only
    # lemmatization is applied.
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    return ' '.join(lem.lemmatize(word) for word in words)

def preprocess_dataset(x):
    """Run ``preprocess`` on each review in ``x`` and return the results as a NumPy array.

    Args:
        x: Iterable of raw review strings.

    Returns:
        numpy.ndarray: The preprocessed reviews, in the same order as ``x``.
    """
    cleaned = [preprocess(review) for review in x]
    return np.array(cleaned)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
CPU times: user 38.1 ms, sys: 5.31 ms, total: 43.4 ms
Wall time: 87.7 ms


In [None]:
# Read the shuffled training split, then normalise the review text.
# The raw texts keep their own name so the cleaned array is not confused
# with them.
train_reviews_raw, y_train = load_dataset('aclImdb/train')
x_train = preprocess_dataset(train_reviews_raw)

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [None]:
# Read the shuffled test split, then normalise the review text.
# The raw texts keep their own name so the cleaned array is not confused
# with them.
test_reviews_raw, y_test = load_dataset('aclImdb/test')
x_test = preprocess_dataset(test_reviews_raw)

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

## Save dataset for future use

In [None]:
# Join the two splits end to end (train rows first, then test rows) and write
# the combined reviews and labels to a single CSV file without the index column.
x_total = np.concatenate([x_train, x_test])
y_total = np.concatenate([y_train, y_test])
df = pd.DataFrame(
    {'review': x_total, 'label': y_total},
)
df.to_csv('imdb_preprocessed.csv', index=False)