-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
66 lines (58 loc) · 2.38 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 7 23:02:20 2019
@author: Abhishek's PC
"""
import os
import pickle
import pprint
import re
from collections import Counter
from string import punctuation

import nltk
import numpy as np
import pandas as pd
import praw
from flask import Flask, render_template, request
from nltk import word_tokenize
from nltk.corpus import stopwords
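# One-time setup: uncomment to download the NLTK resources used below
# ('punkt' for word_tokenize, 'stopwords' for the English stop-word list).
#nltk.download('punkt')
#nltk.download('stopwords')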
# Reddit API client used by detect_flair().
# Credentials are read from the environment when available; the literals are
# kept only as a backward-compatible fallback.
# NOTE(security): the fallback values are committed to source control and
# should be rotated and then removed from this file.
reddit = praw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID', '300HIOocldVcKA'),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET', 'PCktLUhpPaRB1RrBCouEDPdpEBU'),
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'abhishek chopra'),
)
def preProcessData(dataa):
    """Tokenize and clean a collection of raw text documents.

    Each document is split with ``word_tokenize``, lower-cased, stripped of
    English stop words, and has every run of punctuation/underscore
    characters removed from each remaining token. Tokens that become empty
    after the punctuation removal are dropped.

    Args:
        dataa: Iterable of strings, one per document.

    Returns:
        A list with one entry per input document; each entry is the list of
        cleaned tokens for that document (possibly empty).
    """
    # A set gives O(1) membership tests for the per-token stop-word check.
    stopwords_en = set(stopwords.words('english'))
    # Matches runs of characters that are neither whitespace nor word
    # characters, plus underscores (which ``\w`` would otherwise keep).
    strip_non_word = re.compile(r'([^\s\w]|_)+').sub

    PP_data = []
    for sent in dataa:
        cleaned = []
        for token in word_tokenize(sent):
            token = token.lower()
            # Stop words are filtered before punctuation is stripped, so a
            # token that only becomes a stop word after stripping is kept.
            if token in stopwords_en:
                continue
            token = strip_non_word('', token)
            if token:
                cleaned.append(token)
        PP_data.append(cleaned)
    return PP_data
def text_extractor(text, text_type):
    """Return the values of one column as a plain list.

    Args:
        text: Column container indexed by name, e.g. a pandas ``DataFrame``.
        text_type: Name of the column to extract.

    Returns:
        A list holding every value of ``text[text_type]`` in order.
    """
    # Iterate the column positionally instead of looking up labels 0..n-1,
    # which fails (or returns the wrong rows) when the index is not the
    # default RangeIndex, e.g. after filtering or re-indexing.
    return list(text[text_type])
def joiner(data):
    """Collapse each token sequence in ``data`` into a space-separated string.

    Args:
        data: Iterable of iterables of strings.

    Returns:
        A list with one string per inner sequence, its tokens joined by a
        single space (an empty sequence yields an empty string).
    """
    space_join = " ".join
    return list(map(space_join, data))
def detect_flair(url, loaded_model):
    """Predict the flair of a Reddit submission and return it with the real one.

    The submission title and the bodies of its top-level comments are
    concatenated into one document, cleaned with ``preProcessData``,
    re-joined with ``joiner`` and passed to ``loaded_model.predict``.

    Args:
        url: URL of the Reddit submission to classify.
        loaded_model: Object exposing ``predict``; it is called with a
            one-element list containing the cleaned text.

    Returns:
        Tuple ``(prediction, flair)`` where ``prediction`` is whatever
        ``loaded_model.predict`` returns and ``flair`` is the submission's
        ``link_flair_text``.
    """
    submission = reddit.submission(url=url)
    title = submission.title
    # Resolve all "more comments" placeholders before iterating, so every
    # item yielded below is expected to be a real comment with a ``body``.
    submission.comments.replace_more(limit=None)
    # Each body is prefixed with a space; that also separates the title from
    # the first comment. With no comments the document is just the title.
    comments_text = ''.join(
        ' ' + top_level_comment.body for top_level_comment in submission.comments
    )
    feature = [title + comments_text]
    x = joiner(preProcessData(feature))
    flair = submission.link_flair_text
    return (loaded_model.predict(x), flair)
# Path of the pickled model that is loaded once at import time.
filename = 'rfr_model.sav'
# NOTE(security): unpickling executes arbitrary code from the file, so this
# must only ever point at a trusted, locally produced model file.
# The context manager closes the file handle once the model is loaded.
with open(filename, 'rb') as _model_file:
    loaded_model = pickle.load(_model_file)