In [168]:
import jieba
import json
import random
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [48]:
from elasticsearch import Elasticsearch
import elasticsearch.helpers
from datetime import datetime

In [49]:
# Client for the Elasticsearch node at localhost:9200; every query below goes through it.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

In [50]:
# Count the 'user' documents in the 'facebook' index whose gender field matches 'female'.
# NOTE(review): the %d format assumes res['hits']['total'] is a plain integer — confirm
# against the Elasticsearch server version in use.
res = es.search(index='facebook', doc_type='user', body= {'query': {'match': {'gender': 'female'}}})
print("%d documents found" % res['hits']['total'])

58 documents found


In [51]:
# Scan every 'user' document in the 'facebook' index (no query filter is passed).
users = list(elasticsearch.helpers.scan(es, index='facebook', doc_type='user'))
# Keep only (document id, gender) pairs; .get() yields None when '_source' has no 'gender' key.
total_user = [(user['_id'], user['_source'].get('gender')) for user in users]
print(len(total_user))

143


In [132]:
# User IDs that are left out when users are bucketed by gender in the next cell.
# NOTE(review): judging by the names, these are accounts without post messages and
# accounts of foreign users — confirm how the lists were compiled.
no_message = [
    '1618047998214350',
    '1624029027647185',
    '1454298984658260',
]
foreigners = [
    '1695358370488750',
    '10213401737335928',
    '1532260126834371',
    '10154597524691479',
    '10155404471078548',
    '10158882655745234',
]

In [142]:
# Bucket user IDs by gender, skipping the IDs listed in no_message and foreigners.
# The exclusion set is built once instead of concatenating the two lists per user.
excluded_ids = set(no_message + foreigners)
user_dict = {'male': [], 'female': []}
for user_id, gender in total_user:
    if user_id in excluded_ids:
        continue
    # Only genders that have a bucket are kept. Previously every user whose gender
    # was not 'male' (including a missing gender, which arrives here as None) was
    # silently counted as female.
    bucket = user_dict.get(gender)
    if bucket is not None:
        bucket.append(user_id)

In [143]:
# Report how many user IDs ended up in each gender bucket.
male_count = len(user_dict['male'])
female_count = len(user_dict['female'])
print('male: ' + str(male_count) + '\nfemale: ' + str(female_count))

male: 80
female: 54


In [144]:
# Map each user ID to the messages of that user's posts
def get_posts(users):
    """Collect the non-empty post messages of each user.

    For every ID in ``users`` this scans the 'post' documents of the
    'facebook' index whose ``user_id`` matches that ID, using the
    notebook-level ``es`` client, and keeps the ``message`` field of each hit.

    Args:
        users: iterable of user IDs.

    Returns:
        dict mapping each user ID to the list of its messages. Users without
        any usable message map to an empty list.
    """
    posts_dic = {}
    for user in users:
        hits = elasticsearch.helpers.scan(
            es,
            query={'query': {'match': {'user_id': user}}},
            index='facebook',
            doc_type='post',
        )
        messages = (hit['_source'].get('message') for hit in hits)
        # A truthiness test drops '' as before and also drops None, which .get()
        # returns for posts that have no 'message' field. The old `!= ''` test let
        # None through, and the jieba tokenisation step cannot handle it.
        posts_dic[user] = [message for message in messages if message]
    return posts_dic

In [145]:
# Fetch the post messages of every user in each gender bucket
# (get_posts issues one Elasticsearch scan per user ID).
male_posts_dict = get_posts(user_dict['male'])
female_posts_dict = get_posts(user_dict['female'])

In [146]:
# Check if content is Chinese or not
def is_Chinese(uchar):
    """Tell whether ``uchar`` compares within U+4E00..U+9FA5 (both ends inclusive).

    The check is a plain string comparison against the two bounds, so it is
    meant to be called with a single character.

    Returns:
        True when ``uchar`` lies inside the range, otherwise False.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'

In [152]:
# Using jieba to tokenize user posts
def tokenize_posts(user_posts):
    """Segment every message of every user with jieba.

    Args:
        user_posts: dict mapping a key (a user ID) to a list of message strings.

    Returns:
        dict with the same keys; each message is replaced by the tokens that
        ``jieba.cut(message, cut_all=False)`` yields, joined with single spaces.
    """
    def segment(text):
        # Space-joined tokens are the form a whitespace/regex tokenizer can split again.
        return " ".join(jieba.cut(text, cut_all = False))

    return {user_id: list(map(segment, texts)) for user_id, texts in user_posts.items()}

In [155]:
# Segment every message with jieba; each result maps a user ID to a list of
# space-joined token strings (one string per post).
male_seg_posts = tokenize_posts(male_posts_dict)
female_seg_posts = tokenize_posts(female_posts_dict)

In [204]:
# Flatten the female users' segmented posts into one flat list of documents.
corpus = [segment for segments in female_seg_posts.values() for segment in segments]

In [192]:
def display_scores(vectorizer, tfidf_result):
    """Rank vocabulary terms by their score summed over all documents.

    Despite the name, nothing is printed; the ranking is returned.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
            or, on older scikit-learn releases, ``get_feature_names()``.
        tfidf_result: document-term matrix supporting ``.sum(axis=0)``; the
            i-th column total is paired with the i-th feature name.

    Returns:
        list of ``(term, score)`` tuples sorted by descending score.
    """
    # http://stackoverflow.com/questions/16078015/
    # get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
    # and fall back to the legacy method so older installs keep working.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    column_totals = np.asarray(tfidf_result.sum(axis=0)).ravel()
    return sorted(zip(get_names(), column_totals),
                  key=lambda pair: pair[1], reverse=True)

In [None]:
def calculate_tfidf(seg_posts):
    """Rank the terms of all posts in ``seg_posts`` by summed TF-IDF score.

    Args:
        seg_posts: dict mapping a user ID to a list of space-joined token
            strings, as produced by ``tokenize_posts``.

    Returns:
        list of ``(term, score)`` tuples sorted by descending score, or an
        empty list when ``seg_posts`` contains no posts at all.
    """
    # Every post of every user is one document. The previous statement here
    # (`corpus += ... for ...`) was not valid Python, so the cell could not run.
    corpus = [segment for segments in seg_posts.values() for segment in segments]
    if not corpus:
        # TfidfVectorizer raises on an empty document list; report "no terms" instead.
        return []

    tfidf_vectorizer = TfidfVectorizer()
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    return display_scores(tfidf_vectorizer, tfidf)

In [199]:
# `sorted_scores` was never assigned at notebook level, so on a fresh kernel this
# cell failed with a NameError. Compute the ranking here from `corpus`.
# NOTE(review): assumes the ranking is meant to come from the `corpus` built
# above — confirm which corpus produced the saved output.
TOP_TERMS = 198  # number of highest-scoring terms kept as the vocabulary
tfidf_vectorizer = TfidfVectorizer()
sorted_scores = display_scores(tfidf_vectorizer, tfidf_vectorizer.fit_transform(corpus))
bag_of_words = [term for term, _score in sorted_scores[:TOP_TERMS]]

In [200]:
# Inspect the selected vocabulary terms.
bag_of_words

['真的',
 '今天',
 '可以',
 '自己',
 '我們',
 '大家',
 '什麼',
 '一個',
 '還是',
 'com',
 '怎麼',
 '知道',
 '就是',
 '不要',
 'http',
 '哈哈',
 '謝謝',
 '覺得',
 '一起',
 '哈哈哈',
 '不是',
 '這麼',
 'xd',
 '開心',
 '這樣',
 '時候',
 '一直',
 '看到',
 '這個',
 '一下',
 'www',
 '明天',
 '希望',
 '感覺',
 '有人',
 '開始',
 '因為',
 '快樂',
 '好像',
 '喜歡',
 '好吃',
 '朋友',
 '沒有',
 '還有',
 '雖然',
 '有點',
 '現在',
 '時間',
 '所以',
 '很多',
 '回家',
 '一樣',
 '感謝',
 '但是',
 '想要',
 '我要',
 'plurk',
 '剛剛',
 '一定',
 '如果',
 '晚餐',
 '加油',
 '最近',
 '發現',
 '不會',
 '其實',
 '天氣',
 '分享',
 'youtube',
 '需要',
 '不能',
 '只是',
 '原來',
 '好多',
 '可是',
 '不過',
 'watch',
 '台灣',
 '好久',
 '東西',
 'the',
 '第一次',
 '好好',
 'https',
 '生活',
 'to',
 '一天',
 '老師',
 '他們',
 '人生',
 '已經',
 '真是',
 '只有',
 'xdd',
 '到底',
 '生日',
 '這是',
 'qq',
 'you',
 '小孩',
 '一點',
 'xddd',
 '應該',
 '你們',
 '那麼',
 '回來',
 '竟然',
 '世界',
 '一次',
 '不想',
 '照片',
 '結果',
 '台北',
 '一年',
 '努力',
 '幸福',
 '畢業',
 '事情',
 '10',
 '心情',
 '突然',
 '想到',
 '媽媽',
 '每天',
 'ya',
 '出來',
 '可能',
 '一種',
 '重要',
 '地方',
 'my',
 '期待',
 '昨天',
 '每次',
 '準備',
 '看電影',
 '午餐',
 '小心',
 '晚上',
 