In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from nltk.tag import pos_tag #詞性標註
from sklearn.model_selection import train_test_split

# Final Code #

In [43]:
# RMSE #
from math import sqrt
def rmse(pred,target):
    n=len(pred)
    total=0
    for i in range(n):
        total+=abs(pred[i]-target[i])**2
    return sqrt(total/n)

In [44]:
# # # (HashingVectorizer) Linear SVR # # #
from sklearn.feature_extraction.text import HashingVectorizer
data=pd.read_csv("training_data.csv")
test=pd.read_csv("test_data.csv")
x=list(data.iloc[:]["text"]) #only select text column
y=list(data.stars)

text_train, text_test, y_train, y_test=train_test_split(x,y,random_state=0,test_size=0.2,stratify=y)
hashing_vectorizer = HashingVectorizer(n_features=1000000,stop_words="english")
x_train = hashing_vectorizer.fit_transform(text_train) #transform text into vector type
x_test = hashing_vectorizer.transform(text_test)
ans = hashing_vectorizer.transform(test.text)

regr = LinearSVR(random_state=0) #predict method
regr.fit(x_train, y_train) #training data
pred_test=regr.predict(x_test) #test data

# test data #
pred_svr=regr.predict(ans) #predict
print("RMSE:",rmse(y_test,pred_test))

with open("predict_result.csv",'w') as file:  #write file
    for i in range(len(pred_svr)):
        line=str(test.iloc[i]["review_id"])+','+str(pred_svr[i])+'\n'
        file.write(line)

RMSE: 0.9295657059297496


In [45]:
x_temp = hashing_vectorizer.transform(x) #for the final example
pred_temp = regr.predict(x_temp) 

# Another Try #

In [13]:
### using LogisticRegression to compare the prediction of CountVectorizer and TfidfVectorizer ###
clf = LogisticRegression()

vectorizer=CountVectorizer(stop_words="english")
vectorizer.fit(text_train)
x_train=vectorizer.fit_transform(text_train)
x_test=vectorizer.transform(text_test)

clf.fit(x_train,y_train)
pred=clf.predict(x_test)
print("CountVectorizer(classification):",clf.score(x_test,y_test))
print("CountVectorizer(RMSE):",rmse(pred,y_test),"\n")

tfid_stop_vec = TfidfVectorizer(analyzer='word', stop_words='english')
x_train = tfid_stop_vec.fit_transform(text_train)
x_test = tfid_stop_vec.transform(text_test)

clf.fit(x_train,y_train)
pred2=clf.predict(x_test)
print("TfidfVectorizer(classification):",clf.score(x_test,y_test))
print("TfidfVectorizer(RMSE):",rmse(pred2,y_test))

CountVectorizer(classification): 0.49625
CountVectorizer(RMSE): 1.065657074297356 

TfidfVectorizer(classification): 0.50375
TfidfVectorizer(RMSE): 1.1757976016304847


In [14]:
### compare with LinearRegression and LinearSVR ###
text_train, text_test, y_train, y_test=train_test_split(x,y,random_state=42,test_size=0.2,stratify=y)

tfid_stop_vec = TfidfVectorizer(analyzer='word', stop_words='english')
x_train = tfid_stop_vec.fit_transform(text_train)
x_test = tfid_stop_vec.transform(text_test)
ans = tfid_stop_vec.transform(test.text)

clf = LinearRegression()
clf.fit(x_train,y_train)
pred_test=clf.predict(x_test)
print("LinearRegression(RMSE):",rmse(pred_test,y_test),"\n")

regr = LinearSVR(random_state=0) #predict method
regr.fit(x_train, y_train) #training data
pred_test=regr.predict(x_test) #test data
print("LinearSVR(RMSE):",rmse(pred_test,y_test))

LinearRegression(RMSE): 1.388854224918573 

LinearSVR(RMSE): 0.9136411125346426


## classification by part of speech ##

In [28]:
import re
import nltk
from nltk.tag import pos_tag
from nltk.corpus import brown

def tag(string): #mark the part of speech
        return nltk.pos_tag(string)
    
def text_convert(read,write,if_stars):
    #for five items then next to the another part of speech
    data=pd.read_csv(read)
    pos_list=["JJ","JJR","JJS",0,0,\
                "NN","NNP","NNS",0,0,\
                "NNPS","RB","RBR", "RBS",0,\
                "VB","VBG","VBN","VBP","VBZ"] 
    pos_=["ADJ","NOUN","ADV","VERB"]
    total={"ADJ":[],"NOUN":[],"ADV":[],"VERB":[]}

    for i in range(len(data)):
        pat = '[a-zA-Z]+' #only alphabet can be read(there are seveal letters from Greek alphabet)
        text=data.iloc[i]["text"]

        text=text.replace('"',' ') #excluding the sign of "
        bag_of_words=re.findall(pat, text)  ###以list形式返回所有匹配pat([a-zA-Z]+)模式的項
        while '' in bag_of_words: #neglect blanks in the list ('' can not use pos_tag function)
            del bag_of_words[bag_of_words.index('')]

        pos=tag(bag_of_words) #mark the part of speech with bag_of_words (type:list)

        words=pd.DataFrame({"word":bag_of_words})

        pos1=pd.DataFrame(pos)[1] #part of speech(type:DataFrame)
        # rename the column #
        df=pd.concat([words,pos1],axis=1)
        df["pos"]=df[1]
        del df[1]
        pos_tag,counts=list(pd.value_counts(df["pos"]).keys()),list(pd.value_counts(df["pos"]))

        for num in range(len(total)): #counts the numbers of the certain part of speech
            count=0
            for item in range(len(pos_tag)):
                if pos_tag[item] in pos_list[5*num:5*num+5]: #classify and count the part of speech 
                    count+=counts[pos_tag.index(pos_tag[item])] #sum up the similar pos_tag result
            total[pos_[num]].append(count)
    total=pd.DataFrame(total)
    total["review_id"]=data["review_id"]
    if if_stars:
        total["stars"]=data["stars"]
    total.to_csv(write,index=False, encoding='utf8')
    print("finish.")

text_convert("test_data.csv","test(0607).csv",False)
text_convert("training_data.csv","train(0607).csv",True)

finish.
finish.


In [47]:
pos_data=pd.read_csv("train(0607).csv")
pos_data_test=pd.read_csv("test(0607).csv")

merge=["review_id","stars"]
merge_=["ADJ","NOUN","VERB","ADV"]
## merge both of them ##
df=pd.concat([data[merge][:],pos_data[merge_][:]],axis=1)
# add prediction values to strengthen the star level trendency #
temp_df=pd.DataFrame({"pred":pred_temp})
df=pd.concat([df,temp_df],axis=1)
x2=df[["ADJ","NOUN","VERB","ADV","pred"]] #features
y2=pos_data["stars"]
x_train2, x_test2, y_train2, y_test2=train_test_split(x2,y2,random_state=42,test_size=0.2,stratify=y2)
regr = LinearSVR(random_state=0)
regr.fit(x_train2, y_train2)
pred_test2 = regr.predict(x_test2)
final_ans = regr.predict(ans2)

print("LinearSVR(RMSE):",rmse(list(y_test2),pred_test2))

LinearSVR(RMSE): 0.724238250743993


In [48]:
clf = LinearRegression()
clf.fit(x_train2,y_train2)
pred_test2=clf.predict(x_test2)
print("LinearRegression(RMSE):",rmse(pred_test2,list(y_test2)))

LinearRegression(RMSE): 0.7007463454222245
