In [None]:
import os,json
import pandas as pd
from datetime import datetime
from dateutil.parser import parse
import numpy as np
import jieba
import jieba.analyse
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='whitegrid', color_codes=True)


In [None]:
### load files from all folders
def list_dirs(dir, min_files=2):
    """Return the paths of all files found under ``dir``, recursively.

    Parameters
    ----------
    dir : str
        Root directory to scan. (The name shadows the builtin but is kept
        so existing keyword callers keep working.)
    min_files : int, optional
        A directory contributes its files only when it directly contains at
        least this many files. The default of 2 reproduces the original
        ``len(files) > 1`` check.
        NOTE(review): that default skips directories holding exactly one
        file -- confirm this is intended rather than an off-by-one.

    Returns
    -------
    list of str
        File paths in ``os.walk`` order; empty if ``dir`` does not exist.
    """
    paths = []
    # One os.walk pass already yields each directory's file list; the
    # original re-walked every subdirectory through the Python 2-only
    # generator method ``.next()``.
    for subdir, _child_dirs, files in os.walk(dir):
        if len(files) >= min_files:
            # os.path.join avoids the doubled separator that manual '/'
            # concatenation produced when ``dir`` ends with a slash.
            paths.extend(os.path.join(subdir, name) for name in files)
    return paths
# Every file path found under each of the three Weibo export folders.
path_comments = list_dirs('weibo/comments/')
path_reposts = list_dirs('weibo/reposts/')
path_statuses = list_dirs('weibo/statuses/')

# Keep only the paths whose name ends in '.json'.
comment_files = [path for path in path_comments if path.endswith('.json')]
reposts_files = [path for path in path_reposts if path.endswith('.json')]
statuses_files = [path for path in path_statuses if path.endswith('.json')]



def make_df(from_files):
    """Build a DataFrame with one row per JSON post file.

    Parameters
    ----------
    from_files : iterable of str
        Paths to JSON files. Each file must hold a single object with the
        keys ``user`` (containing ``id``, ``gender``, ``location``),
        ``created_at`` and ``text``; ``retweeted_status`` is optional.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``gender``, ``time``, ``location``, ``text``; rows
        are labelled 0..n-1 in the order of ``from_files``. An empty input
        yields an empty frame with the same columns.

    Raises
    ------
    KeyError
        If a required key is missing from a file.
    """
    columns = ['id', 'gender', 'time', 'location', 'text']
    overseas_prefix = u'\u6d77\u5916'  # the two characters meaning "overseas"
    records = []
    for json_file in from_files:
        # ``with`` closes the handle (the original leaked one per file).
        # Binary mode lets json decode the bytes itself instead of relying
        # on the platform's default text encoding.
        with open(json_file, 'rb') as handle:
            post = json.load(handle)
        user = post['user']
        location = user['location']
        if location[0:2] == overseas_prefix:
            # Skip the prefix plus one separator character and keep the rest.
            user_loc = location[3:]
        else:
            # Otherwise keep only the first two characters of the location.
            user_loc = location[0:2]
        try:
            # Prefer the text of the post that was reposted, when present.
            user_text = post['retweeted_status']['text']
        except (KeyError, TypeError):
            # Key absent, or ``retweeted_status`` is not a mapping: fall
            # back to the post's own text. Narrower than the original bare
            # ``except`` so unrelated errors are no longer hidden.
            user_text = post['text']
        records.append([user['id'], user['gender'], parse(post['created_at']),
                        user_loc, user_text])
    # Build the frame once; assigning ``df.loc[index]`` row by row re-allocates
    # the frame on every insert.
    return pd.DataFrame(records, columns=columns)

# Number of JSON files found for comments, reposts and statuses, in that order.
# A single-argument print(...) emits the same text on Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
print('{} {} {}'.format(len(comment_files), len(reposts_files), len(statuses_files)))


In [None]:
### create dataframes and combine for analysis

# Parse each group of files into its own frame.
df_comments = make_df(comment_files)
df_reposts = make_df(reposts_files)
df_statuses = make_df(statuses_files)

# Tag every row with the kind of post it came from.
df_comments['type'] = 'comment'
df_reposts['type'] = 'repost'
df_statuses['type'] = 'statuse'  # spelling must match the later `type` filter

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# As with append, each frame keeps its own 0..n-1 row labels (no ignore_index).
df = pd.concat([df_comments, df_reposts, df_statuses])

In [None]:

# Count the posts whose lower-cased text contains 'katespades'.
(
    df.text
      .str.lower()
      .str.contains('katespades')
      .sum()
)

In [None]:
# Split posts by brand using a case-insensitive regex over the post text.
# .copy() makes each subset an independent frame, so the columns added to it
# in later cells do not trigger SettingWithCopyWarning.
# NOTE(review): the bare 'mk' / 'ks' alternatives match those letters anywhere
# in the text, including inside unrelated words -- confirm that trade-off.
df_mk = df[df.text.str.lower().str.contains(
    'michael kors|michaelkors|micheal kors|michealkors|mk')].copy()
df_ks = df[df.text.str.lower().str.contains(
    'kate spade|katespade|kate spades|katespades|ks')].copy()



In [None]:
### total number of posts and unique users mentioning Michael Kors
# Formatting into one string keeps the printed text identical while making the
# call valid on both Python 2 and Python 3.
print('There are {} posts and {} users mentioning Michael Kors'.format(
    len(df_mk), df_mk['id'].nunique()))

In [None]:
### total number of posts and unique users mentioning Kate Spade
# Formatting into one string keeps the printed text identical while making the
# call valid on both Python 2 and Python 3.
print('There are {} posts and {} users mentioning Kate Spade'.format(
    len(df_ks), df_ks['id'].nunique()))

In [None]:
### the 10 user ids with the most posts overall
(
    df.groupby('id')
      .size()
      .nlargest(10)
)

In [None]:
### the 10 locations with the most posts overall (blank locations excluded)
(
    df[df['location'] != '']
      .groupby('location')
      .size()
      .nlargest(10)
)

In [None]:
### the 10 user ids with the most Michael Kors posts
(
    df_mk.groupby('id')
         .size()
         .nlargest(10)
)

In [None]:
### the 10 locations with the most Michael Kors posts (blank locations excluded)
(
    df_mk[df_mk['location'] != '']
        .groupby('location')
        .size()
        .nlargest(10)
)

In [None]:
### the 10 user ids with the most Kate Spade posts
(
    df_ks.groupby('id')
         .size()
         .nlargest(10)
)

In [None]:
### the 10 locations with the most Kate Spade posts (blank locations excluded)
(
    df_ks[df_ks['location'] != '']
        .groupby('location')
        .size()
        .nlargest(10)
)

In [None]:
### index each brand frame by post time and derive date / hour / brand columns
# The `time` column stays in place; it is also used as the row index.
df_mk.index = df_mk['time']
df_ks.index = df_ks['time']

df_mk['date'] = df_mk.index.date
df_mk['hour'] = df_mk.index.hour
df_mk['brand'] = 'Michael Kors'

df_ks['date'] = df_ks.index.date
df_ks['hour'] = df_ks.index.hour
df_ks['brand'] = 'Kate Spade'

In [None]:
### top mention date for each brand
# One print(...) per brand: the single-argument form is valid on Python 2 and
# Python 3, and each Series gets its own lines instead of being run together.
print(df_mk.groupby('date').size().nlargest(1))
print(df_ks.groupby('date').size().nlargest(1))

In [None]:
# Per-brand frames without the rows tagged 'statuse' (comments and reposts only).
df_mk_type = df_mk[df_mk['type'] != 'statuse']
df_ks_type = df_ks[df_ks['type'] != 'statuse']

# Both brands stacked: once with every post type, once without 'statuse' rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_ts_alltype = pd.concat([df_mk, df_ks])
df_ts = pd.concat([df_mk_type, df_ks_type])


In [None]:
### the hour of day with the most posts across both brands
(
    df_ts_alltype.groupby('hour')
                 .size()
                 .nlargest(1)
)

In [None]:
### peak hour for each brand
# One print(...) per brand: the single-argument form is valid on Python 2 and
# Python 3, and each Series gets its own lines instead of being run together.
print(df_mk.groupby('hour').size().nlargest(1))
print(df_ks.groupby('hour').size().nlargest(1))

In [None]:
### popular words associated with Michael Kors
# Join the lower-cased posts in one pass instead of growing a string inside a
# loop. The newline separator keeps the last word of one post from fusing with
# the first word of the next, which the original separator-less concatenation
# allowed.
words_mk = u'\n'.join(df_mk['text'].str.lower())

# Top 20 keywords by jieba's extract_tags ranking, shown as a single row.
top_word_mk = jieba.analyse.extract_tags(words_mk, topK=20, withWeight=False, allowPOS=())
pd.DataFrame(top_word_mk).T

In [None]:
### popular words associated with Kate Spade
# Join the lower-cased posts in one pass instead of growing a string inside a
# loop. The newline separator keeps the last word of one post from fusing with
# the first word of the next, which the original separator-less concatenation
# allowed.
words_ks = u'\n'.join(df_ks['text'].str.lower())

# Top 20 keywords by jieba's extract_tags ranking, shown as a single row.
top_word_ks = jieba.analyse.extract_tags(words_ks, topK=20, withWeight=False, allowPOS=())
pd.DataFrame(top_word_ks).T

In [None]:
### daily number of Michael Kors comments and reposts, saved to CSV
ts_count_mk = (
    df_mk_type
    .groupby(['date', 'type'])['text']
    .count()
)
ts_count_mk.to_csv('daily_count_mk.csv')
ts_count_mk.head(10)

In [None]:
### daily number of Kate Spade comments and reposts, saved to CSV
ts_count_ks = (
    df_ks_type
    .groupby(['date', 'type'])['text']
    .count()
)
ts_count_ks.to_csv('daily_count_ks.csv')
ts_count_ks.head(10)

In [None]:
### plot total conversation for each brand over the entire time frame
# Without an explicit `order`, the x categories follow the order in which dates
# first appear in df_ts, which is not chronological once several frames have
# been stacked. Sort once and use the same list for the bars and their labels.
# NOTE: seaborn 0.9 renamed factorplot -> catplot and size -> height; the old
# names are kept here so the required seaborn version does not change.
date_order = sorted(df_ts.date.unique())
g = sns.factorplot(x='date', hue='type', col='brand', data=df_ts,
                   size=8, kind='count', order=date_order)
g.set_xticklabels(date_order, rotation=90)

In [None]:
### explore user bias -- number of distinct user ids per gender
(
    df.groupby('gender')['id']
      .nunique()
)

In [None]:
# Posts per (date, gender), pivoted so each gender is a column and each date a row.
# The original assigned `daily_post_count` but then read `daily_posts`, which
# raises NameError on a fresh kernel. Both names are bound here so this cell,
# the plotting cell that follows and any other reader of either name all work.
daily_posts = daily_post_count = (
    df_ts['text']
    .groupby([df_ts.date, df_ts.gender])
    .size()
    .unstack()
)
daily_posts.head()

In [None]:
# Box plot of the columns of `daily_posts`, with the individual values overlaid
# as jittered points on the same axes.
ax = sns.boxplot(data=daily_posts)
ax.set_title('Number of Posts, Averaged Daily', fontsize=18)
sns.stripplot(data=daily_posts, jitter=True)

In [None]:

# Daily post counts split by gender, one panel per brand.
# Without an explicit `order`, the x categories follow the order in which dates
# first appear in df_ts, which is not chronological once several frames have
# been stacked. Sort once and use the same list for the bars and their labels.
# NOTE: seaborn 0.9 renamed factorplot -> catplot and size -> height; the old
# names are kept here so the required seaborn version does not change.
date_order = sorted(df_ts.date.unique())
g = sns.factorplot(x='date', hue='gender', col='brand', data=df_ts,
                   size=10, kind='count', order=date_order)
g.set_xticklabels(date_order, rotation=90)

In [None]:
# Post counts per gender, one panel per brand.
g = sns.factorplot(
    x='gender',
    col='brand',
    data=df_ts,
    size=10,
    kind='count',
)

In [None]:
# Post counts per brand, one panel per gender.
g = sns.factorplot(
    x='brand',
    col='gender',
    data=df_ts,
    size=10,
    kind='count',
)

In [None]:
### number of posts for every gender / post type / brand combination
gender_count = (
    df_ts_alltype
    .groupby(['gender', 'type', 'brand'])['text']
    .count()
)
gender_count

In [None]:
### Not every user posts about both brands, so a fair comparison should only
### use the users who posted about both. Start from per-user counts by post
### type and brand.

user = (
    df_ts_alltype
    .groupby(['id', 'type', 'brand'])['text']
    .count()
)
user.head(10)

In [None]:
### per user: number of unique Michael Kors posts divided by number of unique Kate Spade posts
# The inner merge on `id` keeps only users present in both brand frames.
# `time_x` comes from df_mk and `time_y` from df_ks; distinct timestamps stand
# in for distinct posts.
selected_users = pd.merge(df_mk, df_ks, how='inner', on='id')
posts_by_user = selected_users.groupby('id')
(posts_by_user['time_x'].nunique() / posts_by_user['time_y'].nunique()).nlargest(10)