-
Notifications
You must be signed in to change notification settings - Fork 0
/
StackExchangeClassifier.py
134 lines (86 loc) · 3.11 KB
/
StackExchangeClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Stack Exchange Question Classifier
import pandas as pd
import numpy as np
import json
import codecs
def read_dataset(path):
    """Load a UTF-8 encoded JSON document from *path*.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The Python object produced by parsing the file's JSON content.

    Raises:
        IOError / OSError: if the file cannot be opened.
        ValueError: if the content is not valid JSON.
    """
    # Function-scope import so the block does not depend on a new
    # top-of-file import.
    import io

    # io.open accepts an explicit encoding on both Python 2 and 3;
    # codecs.open is a legacy API (deprecated as of Python 3.14).
    with io.open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)
# ---- Load the questions and their labels, then split into train / test ----

# Every data line is kept as raw text (quoting=3 is csv.QUOTE_NONE); the first
# line of the file is consumed as the header row.
train = pd.read_table("input00.txt", header=0, delimiter="\t", quoting=3)

# One label per line.  Strip the line terminator so each entry is the bare
# topic name ('electronics'), not 'electronics\n'.
with open("output00.txt", 'r') as label:
    test_num_classes = [line.strip() for line in label]

n_rows = train.shape[0]

# Hold out up to 1000 *distinct* rows for testing.  Sampling without
# replacement avoids duplicated test rows (the deprecated
# np.random.random_integers drew with replacement).
test_indices = np.random.choice(n_rows, size=min(1000, n_rows), replace=False)

# Everything that was not held out is training data; kept as a sorted list so
# the row order of Train and the order of Train_classes are the same.
held_out = set(test_indices.tolist())
train_indices = [i for i in range(n_rows) if i not in held_out]

# Positional selection (.ix was removed from pandas); the resulting frames
# keep the original row labels.
Train = train.iloc[train_indices]
Train_classes = [test_num_classes[i] for i in train_indices]

Test = train.iloc[test_indices]
Test_classes = [test_num_classes[i] for i in test_indices]
from bs4 import BeautifulSoup
import re
import nltk
# Run nltk.download() once if the stop-word corpus is not installed yet.
from nltk.corpus import stopwords  # import the stop words list

# ---- Worked example of the cleaning steps on a single training row ----

# Positional access: the row labelled 0 may have been drawn into the test
# split, in which case a label lookup on Train would raise KeyError.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
example1 = BeautifulSoup(str(Train.iloc[0]), "html.parser")

# [] indicates group membership and ^ means "not".  In other words, the
# re.sub() call below says: "find anything that is NOT a lowercase letter
# (a-z) or an uppercase letter (A-Z) and replace it with a space."
letters_only = re.sub(r"[^a-zA-Z]", " ", example1.get_text())
lower_case = letters_only.lower()
words = lower_case.split()

# Show the English stop-word list.
print(stopwords.words("english"))
def review_to_words(raw_review):
    """Turn one raw record into a cleaned, space-separated string of words.

    The input is stringified, HTML markup is removed, every non-letter
    character is replaced by a space, the text is lower-cased and split on
    whitespace, and English stop words are dropped.

    Args:
        raw_review: Any object; ``str(raw_review)`` is the text that is cleaned.

    Returns:
        The remaining words joined by single spaces (possibly an empty string).
    """
    review_text = BeautifulSoup(str(raw_review), "html.parser").get_text()
    # The caret must sit INSIDE the brackets to mean "any non-letter".
    # Outside ("^[a-zA-Z]") it anchors to the start of the string and only a
    # single leading letter would be replaced.
    letters_only = re.sub(r"[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    # A set gives constant-time membership checks in the filter below.
    stops = set(stopwords.words("english"))
    return " ".join(w for w in words if w not in stops)
from sklearn.feature_extraction.text import CountVectorizer

# ---- Bag-of-words features for the training rows ----

# Clean every training row, walking the frame in its own row order.
# NOTE(review): assumes Train's rows are in the same order as Train_classes.
# Iterating Train.values avoids per-row .ix lookups (.ix was removed from
# pandas) and yields the same 1-D value array per row.
true_words = [review_to_words(row) for row in Train.values]

# Keep only the 6000 most frequent words as features.
vectorizer = CountVectorizer(analyzer="word", tokenizer=None,
                             preprocessor=None, stop_words=None,
                             max_features=6000)

# Learn the vocabulary from the training text and build the dense
# document-term count matrix.
train_data_features = vectorizer.fit_transform(true_words).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
from sklearn.ensemble import RandomForestClassifier

# ---- Encode the topic labels as integers and fit the random forest ----

rfc = RandomForestClassifier(n_estimators=200)

# Integer code for each known topic; any other label falls into class 9.
TOPIC_CODES = {
    'electronics': 0,
    'mathematica': 1,
    'android': 2,
    'security': 3,
    'gis': 4,
    'photo': 5,
    'scifi': 6,
    'unix': 7,
    'apple': 8,
}

# Labels read line-by-line from a file may still carry their line terminator;
# strip it, otherwise no label matches a topic name and every sample would be
# assigned class 9.
num_classes = [TOPIC_CODES.get(x.strip(), 9) for x in Train_classes]

classifier = rfc.fit(train_data_features, num_classes)
# ---- Predict on the held-out rows (and on the training rows) ----

# Clean every test row in the frame's own row order.  Iterating Test.values
# yields exactly one value array per row, even when a row label occurs more
# than once, and avoids .ix (removed from pandas).
test_words = [review_to_words(row) for row in Test.values]

# Use transform(), NOT fit_transform(): the test text must be projected onto
# the vocabulary learned from the training text so the feature columns line
# up with the ones the forest was trained on.
test_data_features = vectorizer.transform(test_words).toarray()

result = classifier.predict(test_data_features)    # predictions for Test
result2 = classifier.predict(train_data_features)  # predictions for Train