## Reddit Feedback Extraction

This notebook:
1. Creates the classifier trained on User Voice and some manually labelled Reddit data
2. Applies that classifier to unfiltered Reddit data
3. Appends the lines classified as useful feedback to a file called *reddit_user_feedbback.txt*

In [10]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [4]:
def lower_token(tokens):
    """Return a new list holding the lowercase form of every token in ``tokens``."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

In [5]:
# English stop-word list provided by NLTK; looked up at call time by removeStopWords.
stoplist = stopwords.words('english')


def removeStopWords(tokens):
    """Return the tokens that are not in ``stoplist``, preserving their order.

    The comparison is an exact string match, so tokens should be lowercased
    first if case-insensitive filtering is wanted.
    """
    kept = []
    for word in tokens:
        if word in stoplist:
            continue
        kept.append(word)
    return kept

### Classifier: User Feedback vs. No Feedback 

In [6]:
# Load the labelled training sentences, one sentence per line.
# Blank lines are dropped: split("\n") yields an empty string for a trailing
# newline, and that empty string would otherwise become a training sample in
# both files (i.e. the same empty input labelled as both classes).
with open("positive_training_data.txt", "r") as file_object:
    x_pos = [line for line in file_object.read().split("\n") if line.strip()]

with open("negative_training_data.txt", "r") as file_object:
    x_neg = [line for line in file_object.read().split("\n") if line.strip()]

In [8]:
# Preprocess both classes the same way: tokenise -> lowercase -> drop stop words.
tokens_pos = list(map(word_tokenize, x_pos))
tokens_neg = list(map(word_tokenize, x_neg))

lower_tokens_pos = list(map(lower_token, tokens_pos))
lower_tokens_neg = list(map(lower_token, tokens_neg))

filtered_words_pos = list(map(removeStopWords, lower_tokens_pos))
filtered_words_neg = list(map(removeStopWords, lower_tokens_neg))

# Re-join each token list into one space-separated string for the vectorizer.
# Positive sentences come first, followed by the negative ones.
X = [' '.join(sen) for sen in filtered_words_pos + filtered_words_neg]

# Labels in the same order as X: 1 for the positive file, 0 for the negative file.
Y = [1] * len(filtered_words_pos) + [0] * len(filtered_words_neg)

# Bag-of-words counts; the fitted vectorizer is reused later on unseen data.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Hold out 10% of the samples for evaluation, with a fixed seed for the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X.toarray(),
    np.array(Y),
    test_size=0.1,
    random_state=42,
)

In [9]:
# Fit a decision tree on the training split.
# random_state is fixed so that the fitted tree (and therefore the evaluation
# report and the filtered output) is reproducible across runs; without it,
# scikit-learn may break ties between equally good splits differently each time.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, Y_train)

# Predicted labels for the held-out split, used for the evaluation report.
predictions = clf.predict(X_test)

In [12]:
# Per-class precision / recall / F1 on the held-out split (0 = not feedback, 1 = feedback).
print(
    classification_report(
        Y_test,
        predictions,
        labels=[0, 1],
        target_names=["Not feedback", "PPT User Feedback"],
    )
)


                   precision    recall  f1-score   support

     Not feedback       0.63      0.86      0.73        28
PPT User Feedback       0.91      0.75      0.82        56

         accuracy                           0.79        84
        macro avg       0.77      0.80      0.78        84
     weighted avg       0.82      0.79      0.79        84



### Filter Reddit Data 

In [15]:
def filter_reddit_data(file_iteration, output_filename="reddit_user_feedbback.txt"):
    """Classify one file of raw Reddit lines and append the feedback lines to a file.

    Reads ``unfiltered_reddit_data<file_iteration>.txt``, preprocesses each
    non-blank line (tokenise, lowercase, remove stop words), vectorises the
    result with the module-level ``vectorizer`` and predicts with the
    module-level ``clf``. The predictions array is printed, and every original
    line predicted as class 1 is appended to ``output_filename``.

    Parameters
    ----------
    file_iteration : str or int
        Suffix inserted into the input file name.
    output_filename : str, optional
        File the selected lines are appended to. The default is the name the
        notebook has always written to (spelling included), so existing
        consumers of that file keep working.

    Notes
    -----
    The output file is opened in append mode, so running this twice on the
    same input writes the selected lines twice.
    """
    # str() lets callers pass either "1" or 1 as the suffix.
    filename = "unfiltered_reddit_data" + str(file_iteration) + ".txt"
    with open(filename, "r") as file_object:
        # Skip blank lines (e.g. the empty string left by a trailing newline)
        # so they are neither classified nor written out as empty lines.
        raw_data = [line for line in file_object.read().split("\n") if line.strip()]

    # Nothing to classify; predicting on zero samples would raise.
    if not raw_data:
        return

    data = [word_tokenize(sen) for sen in raw_data]
    data = [lower_token(token) for token in data]
    data = [removeStopWords(sen) for sen in data]
    data = [' '.join(sen) for sen in data]
    # transform (not fit_transform): reuse the vocabulary learned during training.
    data = vectorizer.transform(data)

    predictions = clf.predict(data)
    print(predictions)

    with open(output_filename, "a") as file_object:
        # raw_data and predictions are index-aligned, one prediction per line.
        for line, label in zip(raw_data, predictions):
            if label == 1:
                file_object.write(line + "\n")

In [16]:
filter_reddit_data("1")

[0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0]
