In [2]:
import os
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [3]:
def get_texts_and_labels():
    # Initialize train data and train labels
    X = [] 
    y = []

    # Read text data from directory
    path = "review_polarity/pos"
    for fname in os.listdir(path):
            f =  open(os.path.join(path, fname), 'r') 
            content = f.read()        
            X.append(content)
            y.append("pos")

    path = "review_polarity/neg"
    for fname in os.listdir(path):
            f =  open(os.path.join(path, fname), 'r') 
            #print(f)
            content = f.read()
            X.append(content)
            y.append("neg")
    
    return X,y

In [4]:
def get_model_evals():

    # Get inputs and labels
    X,y = get_texts_and_labels()
    
    # Split into train/test set (9:1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)

    # Generate feature vectors
    vectorizer = TfidfVectorizer(min_df=5,
                                 max_df = 0.8,
                                 sublinear_tf=True,
                                 use_idf=True,
                                 decode_error='ignore')
    train_vectors = vectorizer.fit_transform(X_train)
    test_vectors = vectorizer.transform(X_test)

    # Create the details of the models 
    knn = KNeighborsClassifier(n_neighbors=10)
    rf = RandomForestClassifier(n_estimators=100)
    linsvm = svm.LinearSVC()
    
    # Fit the models and make predictions
    start = time.time()
    knn.fit(train_vectors, y_train)
    pred_knn = knn.predict(test_vectors)
    end = time.time()
    time_knn = end-start

    start = time.time()
    rf.fit(train_vectors, y_train)
    pred_rf = rf.predict(test_vectors)
    end = time.time()
    time_rf = end-start

    start = time.time()
    linsvm.fit(train_vectors, y_train)
    pred_svm = linsvm.predict(test_vectors)
    end = time.time()
    time_svm = end-start

    # Print the results
    print("--- knn ---")
    print("time: ", time_knn)
    print(classification_report(y_test, pred_knn))
    print("--- random forest ---")
    print("time: ", time_rf)
    print(classification_report(y_test, pred_rf))
    print("--- svm ---")
    print("time: ", time_svm)
    print(classification_report(y_test, pred_svm))

In [5]:
get_model_evals()

--- knn ---
time:  0.054412126541137695
              precision    recall  f1-score   support

         neg       0.76      0.76      0.76       103
         pos       0.74      0.74      0.74        97

   micro avg       0.75      0.75      0.75       200
   macro avg       0.75      0.75      0.75       200
weighted avg       0.75      0.75      0.75       200

--- random forest ---
time:  1.1160941123962402
              precision    recall  f1-score   support

         neg       0.76      0.83      0.79       103
         pos       0.80      0.72      0.76        97

   micro avg       0.78      0.78      0.78       200
   macro avg       0.78      0.77      0.77       200
weighted avg       0.78      0.78      0.77       200

--- svm ---
time:  0.042259931564331055
              precision    recall  f1-score   support

         neg       0.89      0.88      0.89       103
         pos       0.88      0.89      0.88        97

   micro avg       0.89      0.89      0.89       200


In [7]:
# Use all the data to train the chosen model        
X,y = get_texts_and_labels()
# Prepare feature vectors
vectorizer = TfidfVectorizer(min_df=5,
                             max_df=0.8,
                             sublinear_tf=True,
                             use_idf=True,
                             decode_error='ignore')
X_vectors = vectorizer.fit_transform(X)

# Train the SVM model with linear kernel
clf = svm.LinearSVC()
clf.fit(X_vectors, y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [8]:
def read_files(path, text_lst, fnames):
    for fname in os.listdir(path):
        if (fname =='.DS_Store'or fname =='.ipynb_checkpoints'):
             continue
        content = open(os.path.join(path, fname), 'rb').read().decode('utf-8', errors='ignore')
        #print(f)
        #content = f.read().decode('utf-8', errors='ignore')        
        text_lst.append(content)
        fnames.append(fname)
    return text_lst,fnames

In [9]:
dirnames = []
directory = 'News Articles'
for x in os.walk(directory):
    dirnames.append(x[0])
dirnames = dirnames[1:]
text_lst = []
fnames = []
for path in dirnames:
    text_lst, fnames = read_files(path, text_lst, fnames)

In [10]:
text_vectors = vectorizer.transform(text_lst)

In [11]:
results = clf.predict(text_vectors)

In [13]:
df = pd.DataFrame({
    "file": fnames,
    "sentiment": results,
    "text": text_lst
})

In [14]:
df.head()

Unnamed: 0,file,sentiment,text
0,116.txt,pos,SOURCE: The Wrap\r\n\r\nTITLE: POK REMIND LEAD...
1,17.txt,pos,SOURCE: The Wrap\r\n\r\nTITLE: THE END OF PROT...
2,843.txt,pos,SOURCE: The Wrap\r\n\r\nTITLE: DEMONSTRATION A...
3,704.txt,pos,SOURCE: The Wrap\r\n\r\nTITLE: A CELEBRATION O...
4,271.txt,pos,SOURCE: The Wrap\r\n\r\nTITLE: THE SHEET OF TH...


In [18]:
df[['source', 'text']] = df['text'].str.split('TITLE:', 1, expand=True)

In [19]:
df.head()

Unnamed: 0,file,sentiment,text,source
0,116.txt,pos,POK REMIND LEADER MARTYRED\r\n\r\nPUBLISHED: ...,SOURCE: The Wrap\r\n\r\n
1,17.txt,pos,THE END OF PROTESTS POK IN ARRESTS\r\n\r\n\r\...,SOURCE: The Wrap\r\n\r\n
2,843.txt,pos,DEMONSTRATION ATTRACTS THOUSANDS IN SPITE OF ...,SOURCE: The Wrap\r\n\r\n
3,704.txt,pos,A CELEBRATION OF DELIVERED KLEPTOCRACY\r\n\r\...,SOURCE: The Wrap\r\n\r\n
4,271.txt,pos,THE SHEET OF THE PUBLIC HEALTH ELODIS - UPDAT...,SOURCE: The Wrap\r\n\r\n


In [20]:
df['source'] = df['source'].str.extract('(.*(?=\r\n\r\n))')
df['source'] = df['source'].str[7:]

In [23]:
df

Unnamed: 0,file,sentiment,text,source
0,116.txt,pos,POK REMIND LEADER MARTYRED\r\n\r\nPUBLISHED: ...,The Wrap
1,17.txt,pos,THE END OF PROTESTS POK IN ARRESTS\r\n\r\n\r\...,The Wrap
2,843.txt,pos,DEMONSTRATION ATTRACTS THOUSANDS IN SPITE OF ...,The Wrap
3,704.txt,pos,A CELEBRATION OF DELIVERED KLEPTOCRACY\r\n\r\...,The Wrap
4,271.txt,pos,THE SHEET OF THE PUBLIC HEALTH ELODIS - UPDAT...,The Wrap
5,448.txt,pos,"ELODIS, KRONOS: A CITY IN NEED, part 1\r\n\r...",The Wrap
6,500.txt,pos,LACK OF THE INTENTIONS OF THE GOVERNMENT OF T...,The Wrap
7,349.txt,pos,SHEET 2002 OF THE PUBLIC HEALTH ELODIS\r\n\r\...,The Wrap
8,799.txt,pos,A CAN ON LIVING CUT TRAGICAL SHORT: ELIAN KAR...,The Wrap
9,217.txt,neg,THE COLLECTION POK DRAWS THOUSANDS OF PROTEST...,The Wrap


In [22]:
df.to_csv(r'text_data.csv', index = False)