In [1]:
import os
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import timeit
import scattertext as st
import collections
from IPython.display import HTML, IFrame
from textblob import TextBlob
from w3lib.html import remove_tags
from wordcloud import WordCloud
from tqdm import tqdm_notebook
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import KernelPCA

In [5]:
def load_data(path, file_list, dataset, encoding='utf8'):
    """Append the full text of every listed file to ``dataset``.

    Parameters
    ----------
    path : str
        Directory (absolute or relative) that contains the files.
    file_list : list
        File names inside ``path``; they are read in the given order.
    dataset : list
        List that is extended in place with one string per file. Each
        string is the complete file content, not a single line.
    encoding : str, optional (default='utf8')
        Encoding used to decode the files.

    Returns
    -------
    None
        The result is delivered by mutating ``dataset``.

    """
    for file_name in file_list:
        full_path = os.path.join(path, file_name)
        with open(full_path, mode='r', encoding=encoding) as handle:
            content = handle.read()
            dataset.append(content)

In [7]:
# Path to dataset location
path = 'aclImdb/'

# One list per split/label combination; each is filled with the raw text of
# the files found in the matching sub-directory
train_pos, train_neg, test_pos, test_neg = [], [], [], []

# Map every sub-directory (relative to `path`) to the list it fills
sets_dict = {'train/pos/': train_pos, 'train/neg/': train_neg,
             'test/pos/': test_pos, 'test/neg/': test_neg}

# Load the data. os.listdir() returns names in arbitrary order, so the file
# names are sorted to make the row order of the resulting lists (and of
# everything derived from them, e.g. seeded splits) reproducible.
for subset_dir, reviews in sets_dict.items():
    subset_path = os.path.join(path, subset_dir)
    file_list = sorted(f for f in os.listdir(subset_path) if f.endswith('.txt'))
    load_data(subset_path, file_list, reviews)

In [8]:
# Merge all four review lists into a single labelled frame:
# the *_pos lists get label 1, the *_neg lists get label 0
labelled_lists = [(train_pos, 1), (test_pos, 1), (train_neg, 0), (test_neg, 0)]
all_frames = [pd.DataFrame({'review': reviews, 'label': label})
              for reviews, label in labelled_lists]
dataset = pd.concat(all_frames, axis=0, ignore_index=True)

In [9]:
# Build one labelled frame per split (training and testing kept separate).
# Within each frame the *_pos rows (label 1) come first, followed by the
# *_neg rows (label 0); the index is renumbered from 0.
train_frames = [pd.DataFrame({'review': train_pos, 'label': 1}),
                pd.DataFrame({'review': train_neg, 'label': 0})]
test_frames = [pd.DataFrame({'review': test_pos, 'label': 1}),
               pd.DataFrame({'review': test_neg, 'label': 0})]

training_dataset = pd.concat(train_frames, axis=0, ignore_index=True)
testing_dataset = pd.concat(test_frames, axis=0, ignore_index=True)

In [10]:
# Write the combined frame to disk; the index is not stored as a column
dataset.to_csv(path_or_buf='dataset.csv', index=False)

In [12]:
# Write the training and testing frames to their own CSV files (no index column)
for split_frame, csv_name in ((training_dataset, 'train_dataset.csv'),
                              (testing_dataset, 'test_dataset.csv')):
    split_frame.to_csv(csv_name, index=False)

In [13]:
# Preview the first five rows of the combined frame
dataset.head(5)

Unnamed: 0,review,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [14]:
# Preview the last five rows of the combined frame
dataset.tail(5)

Unnamed: 0,review,label
49995,I occasionally let my kids watch this garbage ...,0
49996,When all we have anymore is pretty much realit...,0
49997,The basic genre is a thriller intercut with an...,0
49998,Four things intrigued me as to this film - fir...,0
49999,David Bryce's comments nearby are exceptionall...,0


In [15]:
# Number of rows per label value
dataset['label'].value_counts()

1    25000
0    25000
Name: label, dtype: int64

In [16]:
# Print a summary of the frame: index range, columns with their non-null
# counts and dtypes, and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review    50000 non-null object
label     50000 non-null int64
dtypes: int64(1), object(1)
memory usage: 781.3+ KB


In [17]:
# Count missing values in each column
dataset.isnull().sum()

review    0
label     0
dtype: int64

In [18]:
# Flag every row that repeats an earlier, identical row (the first
# occurrence itself is not flagged) and keep the index labels of those rows
is_repeat = dataset.duplicated(keep='first')
duplicate_indices = dataset[is_repeat].index

# Report how many duplicated rows carry a non-null review
n_duplicates = dataset.loc[duplicate_indices, 'review'].count()
print('Number of duplicates in the dataset: {}'.format(n_duplicates))

Number of duplicates in the dataset: 418


In [19]:
# Display the first five duplicated rows
dataset.loc[duplicate_indices].head(5)

Unnamed: 0,review,label
197,Though structured totally different from the b...,1
1633,Everyone knows about this ''Zero Day'' event. ...,1
2136,One of Disney's best films that I can enjoy wa...,1
2843,I Enjoyed Watching This Well Acted Movie Very ...,1
3119,Smallville episode Justice is the best episode...,1


In [20]:
# Render one randomly chosen review as HTML. The text is looked up through
# the 'review' column name instead of column position 0, so the cell does not
# depend on the column order of `dataset`. The draw is not seeded, so each
# run may show a different review.
random_row = np.random.randint(len(dataset))
HTML(dataset['review'].iloc[random_row])

In [2]:
from sklearn.model_selection import train_test_split

In [22]:
# Read the training and testing CSV files back into frames
train_data, test_data = (pd.read_csv(csv_name)
                         for csv_name in ('train_dataset.csv', 'test_dataset.csv'))

In [23]:
# Split the training data into train and validation parts: 30% goes to
# validation, stratified on the label column, with a fixed random seed
train, valid = train_test_split(
    train_data,
    test_size=0.3,
    stratify=train_data['label'],
    random_state=2020,
)

In [24]:
# Write the train and validation parts to their own CSV files (no index column)
for part_frame, csv_name in ((train, 'train.csv'), (valid, 'valid.csv')):
    part_frame.to_csv(csv_name, index=False)