In [1]:
import csv
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [24]:
#creating the training document
def create_csv_file(filename, train=True):
    header = ['row_number', 'text', 'polarity']
    if train == True:
        path_to_pos = "aclImdb/train/pos/"
        path_to_neg = "aclImdb/train/neg/"
    else:
        path_to_pos = "aclImdb/test/pos/"
        path_to_neg = "aclImdb/test/neg/"
        
    count = 0

    with open(filename, 'w', newline='') as f1:
            writer = csv.writer(f1, delimiter=',')
            writer.writerow(header)
            for f in os.listdir(path_to_pos):
                polarity = 1
                if f.endswith(".txt"):
                    open_file = open(path_to_pos+f, "r")
                    data = open_file.read()
                    writer.writerow([count,f'"{data}"',polarity])
                    count += 1
                    open_file.close()
            for f in os.listdir(path_to_neg):
                polarity = 0
                if f.endswith(".txt"):
                    open_file = open(path_to_neg+f, "r")
                    data = open_file.read()
                    writer.writerow([count,f'"{data}"',polarity])
                    count += 1
                    open_file.close()

-----

In [2]:
def create_list_of_docs(filename):
    docs = []
    polarity = []
    with open(filename,'r') as csvfile: # input csv file
            reader = csv.reader(csvfile, delimiter=',')
            next(reader, None)
            for row in tqdm(reader):
                docs.append(row[1])
                polarity.append(row[2])
    return docs, polarity

In [3]:
def remove_special_chars (strs):
    strs = re.sub(r'<.*?>','', strs)
    strs = re.sub(r'[^a-zA-Z. ]','',strs) 
    
    return strs

In [5]:
def data_preprocess(docs):
    for index, row in tqdm(enumerate(docs)):
#         sleep(0)
        docs[index] = remove_special_chars(row)
    return docs

---

In [None]:
create_csv_file("imdb_tr.csv", train=True)

In [None]:
create_csv_file("imdb_te.csv", train=False)

In [6]:
docs_train, y_train = create_list_of_docs("imdb_tr.csv")

25000it [00:00, 49796.25it/s]


In [7]:
docs_test, y_test_true = create_list_of_docs("imdb_te.csv")

25000it [00:00, 49190.72it/s]


In [8]:
docs_train = data_preprocess(docs_train)

25000it [00:00, 50838.82it/s]


In [9]:
docs_test = data_preprocess(docs_test)

25000it [00:00, 52591.39it/s]


In [11]:
y_train = list(map(int, y_train))

In [12]:
y_test_true = list(map(int, y_test_true))

In [13]:
def ngram_classifier (docs_train, y_train, docs_test, ngram_range, tfidf):
    
    if tfidf==True:
        tfidfvec = TfidfVectorizer(stop_words="english",
                                  analyzer = 'word',
                                  lowercase=True,
                                  use_idf=True,
                                  ngram_range=ngram_range)
        
        X_train = tfidfvec.fit_transform(docs_train)
    
        X_test = tfidfvec.transform(docs_test)
        
    else:
        cvec = CountVectorizer(stop_words="english",
                               analyzer = 'word',
                               lowercase=True,
                               ngram_range=ngram_range)
    
        X_train = cvec.fit_transform(docs_train)
        
        X_test = cvec.transform(docs_test)
        
    
    
    clf = SGDClassifier(loss="hinge", penalty="l1")
    
    clf.fit(X_train, y_train)
    
    prediction = clf.predict(X_test)
    
    
    return prediction


In [14]:
y_pred_unigram = ngram_classifier(docs_train, y_train, docs_test, (1,1), tfidf = False)

In [15]:
accuracy_score(y_test_true, y_pred_unigram)

0.83248

In [16]:
y_pred_bigram = ngram_classifier(docs_train, 
                                 y_train, 
                                 docs_test, 
                                 (1,2), 
                                 tfidf = False)

In [17]:
accuracy_score(y_test_true, y_pred_bigram)

0.83968

In [19]:
y_pred_unigram_tfidf = ngram_classifier(docs_train, 
                                        y_train, 
                                        docs_test, 
                                        (1,1), 
                                        tfidf = True)

In [20]:
accuracy_score(y_test_true, y_pred_unigram_tfidf)

0.86948

In [21]:
y_pred_bigram_tfidf = ngram_classifier(docs_train, 
                                       y_train, 
                                       docs_test, 
                                       (1,2), 
                                       tfidf = True)

In [22]:
accuracy_score(y_test_true, y_pred_bigram_tfidf)

0.85772