In [446]:
import re
import jieba
import jieba.analyse
import json
import random
import operator
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
from elasticsearch import Elasticsearch
import elasticsearch.helpers
from datetime import datetime

In [3]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

In [4]:
# Find top10
res = es.search(index='facebook', doc_type='user', body= {'query': {'match': {'gender': 'female'}}})
print("%d documents found" % res['hits']['total'])

58 documents found


In [5]:
# Take all result
users = list(elasticsearch.helpers.scan(es, index='facebook', doc_type='user'))
total_users = [user['_id'] for user in users]
total_user_with_gender = [(user['_id'], user['_source'].get('gender')) for user in users]
print(len(total_user_with_gender))

143


In [6]:
no_message = ['1618047998214350', '1624029027647185', '1454298984658260', '1805190672831590']
foreigners = ['1695358370488750', '10213401737335928', '1532260126834371', '10154597524691479', '10155404471078548', '10158882655745234']

In [7]:
total_users = [user for user in total_users if user not in (no_message + foreigners)]
print(len(total_users))

133


In [8]:
user_dict = {'male': [], 'female': []}
for user in total_user_with_gender:
    if user[0] not in (no_message + foreigners):
        if user[1] == 'male':
            user_dict['male'].append(user[0])
        else:
            user_dict['female'].append(user[0])

In [9]:
print('male: ' + str(len(user_dict['male'])) + '\nfemale: ' + str(len(user_dict['female'])))

male: 79
female: 54


In [10]:
# Filtering urls
def filtering_url(messages):
    filtered_messages = [re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', message.replace('\n', '').replace('\r', '')) for message in messages]
    return filtered_messages

In [11]:
# Mapping user posts
def get_posts(users):
    posts_dic = {}
    for user in users:
        # print(user)
        posts = list(elasticsearch.helpers.scan(es, query={'query': {'match': {'user_id': user}}}, index='facebook', doc_type='post'))
        messages = [post['_source'].get('message') for post in posts if post['_source'].get('message') != '']
        #print(message)
        filtered_messages = filtering_url(messages)
        posts_dic[user] = filtered_messages
    return posts_dic

In [12]:
total_posts = get_posts(total_users)
male_posts_dict = get_posts(user_dict['male'])
female_posts_dict = get_posts(user_dict['female'])

# Using jieba as Chinese tokenizer, and calculating TFIDF

In [13]:
# Using jieba to tokenize user posts
def tokenize_posts(user_posts):
    seg_posts = {}
    for key, messages in user_posts.items():
        seg_posts[key] = [' '.join(jieba.cut(message, cut_all = False)) for message in messages]
    return seg_posts

In [14]:
male_seg_posts = tokenize_posts(male_posts_dict)
female_seg_posts = tokenize_posts(female_posts_dict)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.695 seconds.
Prefix dict has been built succesfully.


In [15]:
# Display tfidf score
def display_scores(vectorizer, tfidf_result):
    # http://stackoverflow.com/questions/16078015/
    scores = zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    index_scores = []
    for index, item in enumerate(sorted_scores):
        index_scores.append((index, item[1]))
    return sorted_scores

In [16]:
def calculate_tfidf(seg_posts):
    corpus = [segment for key, segments in seg_posts.items() for segment in segments]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    tfidf_features = tfidf_vectorizer.get_feature_names()
    
    sorted_scores = display_scores(tfidf_vectorizer, tfidf)
    return sorted_scores

In [17]:
male_tfidf_scores = calculate_tfidf(male_seg_posts)
female_tfidf_scores = calculate_tfidf(female_seg_posts)

In [18]:
male_bag_of_words = [word[0] for word in male_tfidf_scores[:100]]
female_bag_of_words = [word[0] for word in female_tfidf_scores[:100]]

# Graph-based word categorization

In [19]:
%%time
word_index = {}
index = 1

#Building word index dictionary
for key, messages in total_posts.items():
    for message in messages:
        for word in list(message):
            if word not in word_index:
                word_index[word] = index
                index += 1

CPU times: user 544 ms, sys: 0 ns, total: 544 ms
Wall time: 555 ms


In [20]:
def building_weighted_graph(word_graph):
    weighted_word_graph = {}
    # Finding the maximun value of word frequency
    max_freq = max(word_graph.items(), key=operator.itemgetter(1))[1]

    for key, value in word_graph.items():
        weight = value / max_freq
        weighted_word_graph[key] = weight 
    return weighted_word_graph

In [21]:
def building_word_graph(user_posts):
    word_graph = {}
    for key, messages in user_posts.items():
        for message in messages:
            words = list(message)
            
            for i in range(len(words)-1):
                if (words[i], words[i+1]) not in word_graph:
                    word_graph[(words[i], words[i+1])] = 1
                else:
                    word_graph[(words[i], words[i+1])] += 1
    weighted_word_graph = building_weighted_graph(word_graph)
    return weighted_word_graph

In [22]:
male_weighted_graph = building_word_graph(male_posts_dict)
female_weighted_graph = building_word_graph(female_posts_dict)

In [23]:
import networkx as nx
import plotly.plotly as py
from plotly.graph_objs import *

In [24]:
G = nx.random_geometric_graph(200, 1)
# position is stored as node attribute data for random_geometric_graph
pos = nx.get_node_attributes(G, 'pos')

In [25]:
edge_trace = Scatter(
    x=[], 
    y=[], 
    line=Line(width=0.5,color='#888'),
    hoverinfo='none',
    mode='lines')

In [26]:
for key, messages in male_posts_dict.items():
    for message in messages:
        words = list(message)
        if '.' in words:
            print(message)
            #print(words)
            break

不要~~~很痛><....................標珩彤
Boo要這樣Brook Lopez QQ我還沒去看你打球...謝謝你陪籃網走過這些年儘管戰績不佳，歷經搬家也搭配過Vince Carter Delvin Harris Deron Williams到現在的Lin真的很喜歡你的投籃還有打板中距離以及上個球季練成的三分線去湖人加油籃網的1號位該由Lin還是DLO?雖然很喜歡林書豪但有那麼有天分的一號位或許第六人更適合林一大早就很不捨QQ我愛大羅Let's Go Nets!
多學多聞多問成功三要件·...........放屁啦，每次問個問題就崩潰我是要問什麼鬼學什麼毛啊⋯⋯
今年終於如願加入了活動組或許是種羨慕吧還記得當初問饅頭:妳會不會後悔找我加入活動組饅頭說:不會阿 這給我很大的鼓勵 很怕自己做不好其實謝謝柯柯 我知道你沒有我帥 可是你也不要氣餒 動組有你CARRY真的很棒 還記得有一天你在幫忙做火球 在那邊鬧脾氣 真的很好笑 很低能 辛苦了 上樑不正 這很難不去聯想就是說吼....謝謝饅頭 謝謝妳當初找我進動組 希望沒有讓妳後悔 動組有妳CARRY真的很讚 雖然小潘很變態 在動組大家的心目中妳根本是女神 真的很謝謝妳跟ㄎㄎ的包容與體貼 沒有你們 就沒有我們動組謝謝我的PARTNER馬小 真的很高興很跟你分在一組 太神拉 這是命運的安排 還記得一開始載你去發傳單 你就把手插進去我的口袋 其實還蠻害羞的哈哈 你真的太有才了 火球.賭場.大地各種CARRY 沒有你我都不知道該怎辦了 真得很感謝<3謝謝動組的夥伴 每天雖然充斥著髒話.變態的話 可是我很愛這種相處模式 大家一起忙一起瘋一起嘿嘿一起當農奴一起完成這次電資 大家真的超猛 舞.戲.大地.火線.賭場.RPG.夜教.主持等等.....真的不能不說我們是地表最強的活動組團隊  這次電資營讓我覺得最慶幸的是我認識了你們 有你們的的活動組才是活動組 謝謝你們讓我找回了自己最終也是最初的感動 以後一定要常常吃飯.聊天.內戰 愛你們謝謝2014電資營MISt strEEt全體工人 雖然過程中沒有盡善盡美 可是我們都努力過了 我們一起征服了 這樣就已經足夠了 YA有好多好多話想說 是我不夠體貼 總之辛苦了 嗯嗯哈哈嘻嘻嘿嘿亨亨嘖嘖蹦蹦QQㄜㄜ嗚嗚七槍七槍電資營讓我覺得最慶幸的是真的真的我認識了動組的你們
明年上班..

# Topic Modeling

In [396]:
with open('./extra_dictionary/stop_words.txt','r') as file:
    stopwords = [line.rstrip('\n') for line in file]

In [397]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [398]:
def stem_words(sentence):
    stemmed_term_xd = re.sub(r'(x|X)+(D|d)+', 'xd', sentence)
    stemmed_term_qq = re.sub(r'(q|Q)+(q|Q)+', 'qq', stemmed_term_xd)
    stemmed_digit = re.sub(r'([0-9])([0-9])*', '', stemmed_term_qq)
    words = list(jieba.cut(stemmed_digit, cut_all = False))
    stemmed_sentence = ' '.join([word for word in words if word not in stopwords])
    return stemmed_sentence

In [399]:
matched_id = []
corpus = []
for key, messages in total_posts.items():
    for message in messages:
        matched_id.append(key)
        stemmed_message = stem_words(message)
        corpus.append(stemmed_message)

In [400]:
print(len(matched_id))
print(len(corpus))

84497
84497


In [401]:
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(corpus)

In [402]:
tfidf_sorted_scores = display_scores(tfidf_vectorizer, tfidf)

In [403]:
bag_of_words = [word[0] for word in tfidf_sorted_scores[:200]]

In [404]:
print(bag_of_words)

['xd', '真的', '今天', '可以', '大家', '一個', '還是', '覺得', '知道', '就是', '明天', '這樣', 'qq', '哈哈', '看到', '不要', '不是', '這麼', '一直', '時候', '一下', '好像', '感覺', '一起', '有人', '哈哈哈', '希望', '開始', '謝謝', '有點', '沒有', '現在', '還有', '喜歡', '不過', '加油', '原來', '時間', '開心', '剛剛', '到底', '一樣', '快樂', '發現', '朋友', '如果', '一定', '台灣', '不會', '回家', '分享', '好吃', '很多', '不能', '應該', '好多', '結果', '晚安', '我要', '可是', '事情', '東西', '這是', '其實', '最近', '台北', '感謝', '正在', '一天', '已經', '只是', '一次', '根本', '第一次', '世界', '想要', '睡覺', '真是', '準備', '好久', '一點', '出來', '那麼', '只有', '突然', '人生', '需要', '晚餐', '回來', '天氣', '晚上', '這種', '生活', '好好', '想到', '老師', '胡牌', '他們', '終於', '生日', '照片', '參加', '問題', '畢業', '起來', '活動', '可能', '整個', '不想', '啊啊啊', '結束', '昨天', '完全', '地方', '繼續', '遇到', '不用', '一年', '不到', '好奇', '是不是', '兩個', '只能', '今年', '竟然', '心情', '早上', '努力', '果然', '每次', '第一', '工作', '看看', 'ya', '大概', '各位', '一堆', '每天', '期待', '可怕', '一種', '影片', '宿舍', '手機', '下雨', 'orz', '這次', '早安', '早餐', '一些', '以前', '厲害', '我覺', '小心', '哈哈哈哈', '非常', '電腦', '回到', '學校', '有趣', '午餐', '不錯', '怎樣', 'go', '不見', '小

In [405]:
tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(corpus)

In [406]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [407]:
lda = LatentDirichletAllocation(n_topics=4, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [408]:
lda.fit(tf)


n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=4, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [409]:
distribution = lda.fit_transform(tf)


n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21



In [410]:
print(distribution)

[[ 0.25        0.25        0.25        0.25      ]
 [ 0.02778917  0.02780652  0.91659435  0.02780995]
 [ 0.03648737  0.03571534  0.8920818   0.03571549]
 ..., 
 [ 0.02130263  0.02083519  0.52798044  0.42988173]
 [ 0.23870814  0.00642995  0.74845113  0.00641078]
 [ 0.60778559  0.0625093   0.26719522  0.06250988]]


In [411]:
tf_feature_names = tf_vectorizer.get_feature_names()

In [412]:
print_top_words(lda, tf_feature_names, 20)

Topic #0:
xd 今天 真的 大家 明天 qq 可以 謝謝 好像 還有 加油 一個 看到 還是 哈哈 一起 不過 好多 開心 開始
Topic #1:
ahq 圖靈機 my 哈哈哈 啊啊啊 幹幹 正在 go so be 程式 with day 呵呵 ya all we tpa lol by
Topic #2:
可以 真的 一個 知道 就是 還是 不是 時候 不要 覺得 一直 這樣 這麼 大家 今天 如果 沒有 只是 很多 事情
Topic #3:
參加 活動 好久 一起 神魔 鎖鎖鎖 __ 作業 快來 ___ 儒佳 打卡 網址 問卷 不見 飲料 gg de 分享 lab



In [438]:
i = 0
user_topic_distribution = defaultdict(lambda:defaultdict(list))
for values in distribution:
    user_id = matched_id[i]
    # print(user_id)
    maximum = max(values)
    if maximum != 0.25:
        index = list(values).index(max(values))
        user_topic_distribution[user_id][index].append(maximum)
    i += 1

In [501]:
user_topic_count = defaultdict(lambda:defaultdict(lambda:0))
for key, items in user_topic_distribution.items():
    if key not in user_topic_count:
        user_topic_count[key][0] = 0
        user_topic_count[key][1] = 0
        user_topic_count[key][2] = 0
        user_topic_count[key][3] = 0
    for topic, values in items.items():
        user_topic_count[key][topic] = len(values)

In [512]:
user_topic_table = [['id', 'topic0_count', 'topic1_count', 'topic2_count', 'topic3_count']]
for key, items in user_topic_count.items():
    scores = [key]
    for i in range(4):
        scores.append(user_topic_count[key][i])
    user_topic_table.append(scores)

In [513]:
headers = user_topic_table.pop(0)
df = pd.DataFrame(user_topic_table, columns=headers)
df

Unnamed: 0,id,topic0_count,topic1_count,topic2_count,topic3_count
0,1822107337815837,6,1,10,4
1,1808122152538948,414,99,123,50
2,1525066240878271,63,30,45,17
3,10210988536529598,25,43,41,28
4,1386097208093454,311,90,413,86
5,1896221243728018,43,46,58,49
6,1634011049952111,33,27,47,15
7,1513944438661823,53,29,59,17
8,1840120512668185,151,64,419,30
9,1721752347837052,35,7,39,14
