In [103]:
# Author: Christine Nguyen
import pandas as pd
import numpy as np
import pickle 
import jieba
import jieba.analyse
import re #re stands for regular expression

from nltk.corpus import stopwords

In [102]:
# pd.set_option('display.max_rows', 500)
# xiahua = pd.read_pickle("xiahua.pickle")

# Pickle stuff 

In [None]:
#why we pickle stuff?? 
#We do this so that we don't need to constantly read the text file and put the data into the dataframe. 
#This saves us time running that code, and we can pick up where we left off with the dataframe

In [2]:
#code to read/open up a pickled file

# xiahua = pd.read_pickle("xiahua.pickle")

In [104]:
#code to write to a file

# xiahua.to_pickle("xiahua.pickle")
# xiahua.tail()

## Read in data 

In [158]:
contents = [] #empty python list

In [159]:
#parse line by line, and strip contents in line 
with open('pre-discussion-4125.txt', encoding='utf8') as f:
    for line in f:
        line = line.strip() #strip line
        stuff = [x.strip() for x in line.split("\t")] #extract contents in line, split by tab
        contents.append(stuff) #insert into comprehensive list
    f.close() #close reading 

In [107]:
contents #shows everything in the contents list

[['项目id', '第几条主评论', '日期', '是否是回复', '评论者id', '评论文本', '众筹前第几天'],
 ['3510',
  '152',
  '2015/5/6',
  '0',
  'gaodapk',
  '一开始我以为是你们帮设计表白图案那，我想买一件那种把自己设计的图案画到衣服上，就是用丙烯画那种。我有自己初步草图不知UI设计师能否帮完善 再画到衣服上哪？',
  '-1'],
 ['3510',
  '154',
  '2015/5/6',
  '0',
  '158O1319OO6',
  '我感觉就是UI设计师在拼颜值啊~谁看好他就支持他，颜值高支持多，我有点想支持了。。。',
  '-1'],
 ['3510', '156', '2015/5/5', '0', '逆战游戏玩家', '什么时候开卖[疑问]', '-2'],
 ['3722', '106', '2015/4/27', '0', 'jove0916', '几点开始啊', '-1'],
 ['3722', '108', '2015/4/27', '0', '有Bigger', '具体什么时间开始呢？', '-1'],
 ['3722', '109', '2015/4/27', '0', 'jove0916', '什么时候开卖', '-1'],
 ['3722', '110', '2015/4/27', '0', 'jove0916', '想问下是什么材质，PC？', '-1'],
 ['3722',
  '111',
  '2015/4/27',
  '0',
  'cnhkygmm',
  '外出旅游拍照时，站在景点拍摄，如何放支架？似乎这个东东只适用于有平台可以放支架的场合，外出旅游拍全家福还是得用自拍杆。',
  '-1'],
 ['3722',
  '112',
  '2015/4/27',
  '0',
  '大***爰',
  '想起过年拍全家福用卷纸挡住机身的狼狈，真是很实用的东西，款式看似简单，但直接了当没有多余的设计[真棒]，看电影也能当支撑架，赞',
  '-1'],
 ['3722', '113', '2015/4/27', '0', '2***4', '[吻][吻]棒棒的', '-1'],
 ['3722',
  '114',
  '2015

In [160]:
np.asarray(contents) #convert list to numpy array

array([list(['项目id', '第几条主评论', '日期', '是否是回复', '评论者id', '评论文本', '众筹前第几天']),
       list(['3510', '152', '2015/5/6', '0', 'gaodapk', '一开始我以为是你们帮设计表白图案那，我想买一件那种把自己设计的图案画到衣服上，就是用丙烯画那种。我有自己初步草图不知UI设计师能否帮完善 再画到衣服上哪？', '-1']),
       list(['3510', '154', '2015/5/6', '0', '158O1319OO6', '我感觉就是UI设计师在拼颜值啊~谁看好他就支持他，颜值高支持多，我有点想支持了。。。', '-1']),
       ...,
       list(['60781', '457', '2016/6/22', '0', 'j***g', '啊啊啊啊！从微博上看到就果断来搜了！前排啊！！这时候拼的就是手速了！', '-3']),
       list(['60781', '458', '2016/6/22', '0', 'jd_GOODLI624', '我抢，求抽中[太阳][握手]', '-3']),
       list([''])], dtype=object)

## Put data into dataframe

In [161]:
#convert numpy array to pandas dataframe 
xiahua = pd.DataFrame(data=contents[1:], columns = contents[0])

In [162]:
#renamed columns to English
#I could also have renamed the columns when I first instatiated the dataframe
#There's more than one way to get to the same solution
xiahua.rename(columns={'项目id':'Project ID'
                         , '第几条主评论':'Discussion ID'
                         , '日期':'Date', '是否是回复':'Reply'
                         , '评论者id':'Discussant ID'
                         , '评论文本':'Text'
                         , '众筹前第几天':'Days Before'}, inplace=True)

In [163]:
xiahua.drop([46441], inplace=True) #dropped empty row

In [164]:
xiahua.head()

Unnamed: 0,Project ID,Discussion ID,Date,Reply,Discussant ID,Text,Days Before
0,3510,152,2015/5/6,0,gaodapk,一开始我以为是你们帮设计表白图案那，我想买一件那种把自己设计的图案画到衣服上，就是用丙烯画那...,-1
1,3510,154,2015/5/6,0,158O1319OO6,我感觉就是UI设计师在拼颜值啊~谁看好他就支持他，颜值高支持多，我有点想支持了。。。,-1
2,3510,156,2015/5/5,0,逆战游戏玩家,什么时候开卖[疑问],-2
3,3722,106,2015/4/27,0,jove0916,几点开始啊,-1
4,3722,108,2015/4/27,0,有Bigger,具体什么时间开始呢？,-1


# Basic text pre-processing
### Remove punctuation

In [165]:
xiahua['Text'] = xiahua['Text'].str.replace('[^\w\s]','') #removes punctuation

### Remove English characters

In [166]:
#make all English characters lower case. We do this because Python is case sensitive 
xiahua['Text'] = xiahua['Text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [167]:
cleaned = []
for text in xiahua['Text'].iteritems():
    cleaned.append(re.sub('[a-z]', '', text[1]))

In [168]:
xiahua['Text'] = cleaned

### Remove numbers

In [169]:
cleaned = []
for text in xiahua['Text'].iteritems():
    cleaned.append(re.sub('[0-9]', '', text[1]))

In [170]:
xiahua['Text'] = cleaned

### Remove Chinese stop words

In [232]:
#read in chinese stop words into a list
stop_chinese = []
with open('stopwords.txt', encoding='utf8') as f:
    for line in f:
        line = line.strip() 
        stop_chinese.append(line) 
    f.close() 

In [210]:
test = xiahua.copy()

In [181]:
somewords = ['多次下单哈哈', '吼吼吼给很多人看了都说不错快点到号吧要下手啦 哈哈哈哈哈']

In [196]:
seg_list = jieba.cut('多次下单哈哈', cut_all=False)
# print("Full mode: "+"/ ".join(seg_list))

In [233]:
def remove_chinese_stop_words(text_column, stopwords):
    new_col=[]
    for text in text_column.iteritems():
        seg_list = jieba.cut_for_search(text[1])
        final_str = ""
        for seg in seg_list:
            if seg not in stopwords:
                final_str = final_str+seg
        new_col.append(final_str)
    return new_col

In [234]:
new_col = remove_chinese_stop_words(test.Text, stop_chinese)

In [235]:
test['Cleaned_Text'] = new_col

In [238]:
test.loc[test.Cleaned_Text.str.contains('者')]

Unnamed: 0,Project ID,Discussion ID,Date,Reply,Discussant ID,Text,Days Before,Cleaned_Text
48,5339,97,2015/4/19,0,cnhkygmm,电脑可以直接访问云盘无需安装软件这点很赞但不清楚是否可以在电脑上操作将电脑云盘手机三者之间的...,-4,电脑直接访问云盘无需安装软件这点很赞清楚是否电脑操作电脑云盘手机三者之间文件限格式相互随意复...
162,5565,110,2015/4/13,1,sootao,欢迎加群 众多电子烟爱好者的聚集地哦,-2,欢迎加群 众多电子烟爱好好者爱好者聚集聚集地
367,7502,32,2015/4/10,1,jd_风儿伤,亲其实任何品牌的车主都有参加选择支持或反对此次保养众筹的权利虽然品牌不同但是认真选择正确判断...,-4,亲其实品牌车主参加选择支持反对保养众筹权利品牌不同认真选择正确判断消费消费者对众筹项目应有贡...
375,7598,104,2015/4/22,0,以马,靠手杆抛杆都能承受吗能承受我是不是还要单独线要几点零的线更安全些你这东西标价多配备为什么不全...,-5,靠手杆抛杆承受承受是不是单独线要几点零线更安全东西标价配备不全一点理论探测水深蓝牙范围东西确...
495,8019,249,2015/4/20,0,7***m,希望不要让消费者们失望了握手,-3,希望不要消费消费者们失望握手
507,8019,262,2015/4/18,0,jd_╰Esc__·╮坚强,支持者每人发一台体验一下那就好了嘿嘿,-5,支持支持者每人发一台体验一下那就好
876,9054,150,2015/4/16,0,f***5,声道分离度你在逗我吧手机都了本来觉得外形不错可以考虑却不公布和运放型号公布会吓走爱好者要么神...,-4,声道分离度逗我手机本来觉得外形不错考虑公布运放型号公布会走爱好好者爱好者神逻辑渣芯片只会更烧油
979,9188,73,2015/4/12,0,cityhunterxyzvbw,额又是个云产品这个使用按摩椅的人不能自己控制么难道别人比他自己更了解他的需要何必弄个云控制意...,-1,额云产品使用按摩按摩椅不能不能自己控制么难道更了解需要何必弄个云控制意义何在所谓新添加自动程...
1006,9196,296,2015/4/17,0,六月之雨,而最后也是最严重的不足则是太少了感觉产品上市的太早了硬件设计上虽然已经完成但是还远远没有跟上...,-4,最后严重不足太少感觉产品上市太早硬件设计已经完成远远没有跟上说一句开发任重道远分仅凭现在产品...
1062,9214,140,2015/4/16,1,网***6,您好感谢您对酷多啦儿童智能手表的支持在元的支持者中将由京东官方抽取位幸运支持者每人送出酷多啦...,-1,您好感谢感谢您对酷儿童智能手表支持元支持支持者中将京东官方抽取位幸运支持支持者每人送出酷儿童...


In [221]:
new_col = []
for text in test.Text.head().iteritems():
    seg_list = jieba.cut_for_search(text[1])
    final_str = ""
    for seg in seg_list:
        if seg not in stop_chinese:
            final_str = final_str+seg
    new_col.append(final_str)
#     list(text)[1] = final_str
#     tuple(text)
#     print(text)
#     print(list(text)[1])

In [222]:
new_col

['帮设计表白图案想买一件那种设计图案图案画衣服丙烯丙烯画那种初步草图不知设计设计师帮完善 再画衣服',
 '感觉设计设计师拼颜值看好支持颜值高支持有点想支持',
 '开卖疑问',
 '几点',
 '具体时间']

In [198]:
testest = []

In [206]:
str_in = '多次下单哈哈'
seg_list = jieba.cut_for_search(str_in)
final_str = ""
for seg in seg_list:
    if seg not in stop_chinese:
        final_str = final_str+seg
print(final_str)

多次下单
