In [1]:
import json
import csv
import operator
import datetime
import pandas as pd
from graphAPI import*
# NOTE(review): `po` is never imported explicitly in this notebook; it is
# presumably exposed by the star import from graphAPI — confirm.
po.offline.init_notebook_mode()

In [2]:
def load_data(file_name):
    """Load a JSON document from the notebook's ``../data`` directory.

    Args:
        file_name: Name of the file, relative to ``../data/``.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, so non-ASCII text loads the same way on every machine.
    with open('../data/{0}'.format(file_name), encoding='utf-8') as data_file:
        return json.load(data_file)

In [3]:
def load_csv_file(file_name):
    """Load per-video features from a CSV in ``../data``, keyed by post id.

    Args:
        file_name: Name of the comma-separated file, relative to ``../data/``.
            It must contain the columns ``postId``, ``wordLevel``,
            ``videoSpeed``, ``subtitleLengthRatio``, ``sectionLength`` and
            ``wordList``.

    Returns:
        dict: ``{postId: {column name: value}}`` holding the five feature
        columns for every row. When a ``postId`` occurs more than once, the
        last row wins.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If one of the expected columns is missing.
    """
    feature_columns = (
        'wordLevel',
        'videoSpeed',
        'subtitleLengthRatio',
        'sectionLength',
        'wordList',
    )
    # Decode explicitly as UTF-8 rather than the platform's locale default.
    with open('../data/{0}'.format(file_name), encoding='utf-8') as data_file:
        frame = pd.read_csv(data_file, delimiter=',')
    # Round-trip through JSON to obtain plain Python values laid out as
    # {column: {row label: value}} (the default to_json orientation).
    columns = json.loads(frame.to_json())
    return {
        post_id: {name: columns[name][row_key] for name in feature_columns}
        for row_key, post_id in columns['postId'].items()
    }

In [4]:
# Load the three inputs from ../data: two JSON files via load_data and one
# CSV via load_csv_file (which returns a dict keyed by postId).
students = load_data('member0_8000_v0125.json')
video_data = load_data('videoInfo_0125_v1.json')
original_video_data = load_csv_file('videoDataInfo.csv')

In [5]:
# Count how often each video id is a student's first pick and how often it is
# their last pick. Students without any chosen video are ignored.
first_chosen_video = {}
last_chosen_video = {}
for student in students:
    picks = student['member_info']['chosen_video']
    if len(picks) == 0:
        continue
    opener, closer = picks[0], picks[-1]
    first_chosen_video[opener] = first_chosen_video.get(opener, 0) + 1
    last_chosen_video[closer] = last_chosen_video.get(closer, 0) + 1

In [6]:
# Show the first-pick tally (video id -> number of students).
print (first_chosen_video)

{5186: 31, 10564: 2, 3589: 1, 3913: 7956, 14394: 3, 9770: 3, 4974: 1, 10565: 1}


In [7]:
# Show (video id, count) pairs for last picks, most frequent first.
print (sorted(last_chosen_video.items(), key=operator.itemgetter(1), reverse=True))

[(3913, 2311), (8545, 331), (3711, 314), (9851, 206), (10023, 165), (10250, 160), (7130, 158), (10257, 157), (4220, 144), (6061, 127), (3585, 124), (5186, 115), (10565, 108), (10303, 107), (8456, 102), (10019, 94), (8859, 84), (9771, 83), (4802, 83), (9626, 76), (4974, 69), (9770, 68), (10177, 66), (10026, 61), (8547, 58), (12675, 54), (3649, 50), (9578, 47), (9091, 47), (12218, 47), (10564, 46), (12673, 46), (10457, 45), (4336, 45), (8457, 45), (9769, 39), (5433, 37), (12597, 36), (4561, 36), (8918, 35), (10201, 33), (10074, 32), (10543, 31), (13292, 31), (12603, 28), (11406, 27), (4165, 26), (10308, 25), (5806, 25), (7126, 25), (9625, 25), (5881, 25), (13004, 24), (11863, 23), (10568, 23), (3693, 22), (5895, 22), (11698, 22), (6781, 21), (10535, 21), (8700, 21), (6151, 20), (8717, 20), (50, 20), (10301, 20), (5797, 20), (8917, 20), (10020, 20), (9916, 19), (12671, 19), (10073, 19), (6521, 19), (9097, 19), (12951, 18), (8860, 18), (11621, 18), (2093, 17), (14393, 17), (12908, 17), (13

In [8]:
# Lookup table from each video's postID to its category value.
video_categories = {video['postID']: video['category'] for video in video_data}

In [9]:
# Tally category occurrences over every video chosen by every student; a
# video contributes once for each category it is tagged with.
chosen_category = {}
for student in students:
    for picked in student['member_info']['chosen_video']:
        # The lookup key is stringified before indexing video_categories.
        for label in video_categories[str(picked)]:
            chosen_category[label] = chosen_category.get(label, 0) + 1

In [10]:
# Display names for the numeric category ids used as keys in chosen_category.
CATEGORY_NAMES = {
    0: '演說大師',
    1: '知識補帖',
    2: '食指大動',
    3: '人際關係',
    4: '心靈雞湯',
    5: '環遊全球',
    6: '職場生存',
    7: '全球脈動',
    8: '生活娛樂',
}

# Replace each numeric id key with its display name. Ids that are absent
# (never chosen, or already renamed by an earlier run of this cell) are
# skipped, so re-running the cell no longer raises KeyError.
for category_id, category_name in CATEGORY_NAMES.items():
    if category_id in chosen_category:
        chosen_category[category_name] = chosen_category.pop(category_id)

In [11]:
# Show the per-category tally after the keys have been renamed.
print (chosen_category)

{'演說大師': 11400, '全球脈動': 5415, '環遊全球': 5727, '生活娛樂': 23279, '知識補帖': 32720, '食指大動': 6708, '職場生存': 4623, '人際關係': 8245, '心靈雞湯': 5643}


In [12]:
# (category, count) pairs ordered by ascending count.
sort_chosen_category = sorted(chosen_category.items(), key=operator.itemgetter(1))

In [13]:
# Chart the per-category tallies.
# NOTE(review): generate_bar_chart is not defined in this notebook; it
# presumably comes from the graphAPI star import — confirm.
generate_bar_chart(sort_chosen_category, "Category of students' chosen videos")

In [14]:
# Histogram of students' average listening scores, plus the ids of the
# students that have one. An average of exactly 0 is skipped.
score_distribution = {}
students_with_score = []
for student in students:
    info = student['member_info']
    member_id = info['memberid']
    avg_score = info['listen_score']['average']
    if avg_score != 0:
        score_distribution[avg_score] = score_distribution.get(avg_score, 0) + 1
        students_with_score.append(member_id)

In [15]:
# (average score, student count) pairs ordered by ascending score.
sort_score_distribution = sorted(score_distribution.items(), key=operator.itemgetter(0))

In [16]:
# Chart the score histogram.
# NOTE(review): generate_bar_chart is not defined in this notebook; it
# presumably comes from the graphAPI star import — confirm.
generate_bar_chart(sort_score_distribution, "Distribution of students' average score")

In [17]:
# Dump the raw records of the first student that has a listening score:
# the per-item scores, the course section info and the chosen videos.
for student in students:
    info = student['member_info']
    if info['memberid'] != students_with_score[0]:
        continue
    scores = info['listen_score']['score_ary']
    print (scores)
    print (info['course_status']['section_info'])
    print (info['chosen_video'])



{'1': 94, '14': 83, '3': 98, '7': 100, '12': 99, '5': 94, '0': 96, '2': 96, '9': 99, '11': 95, '6': 98, '13': 99, '8': 91, '10': 92, '4': 93}
[{'localposition': 0, 'end': 7, 'audio_path': 'audio_file/play/speakenglish', 'postid': 3913, 'start': 0, 'section': 0}, {'localposition': 1, 'end': 13, 'audio_path': 'audio_file/play/speakenglish', 'postid': 3913, 'start': 8, 'section': 1}, {'localposition': 2, 'end': 20, 'audio_path': 'audio_file/play/speakenglish', 'postid': 3913, 'start': 14, 'section': 2}, {'localposition': 3, 'end': 26, 'audio_path': 'audio_file/play/speakenglish', 'postid': 3913, 'start': 21, 'section': 3}, {'localposition': 4, 'end': 36, 'audio_path': 'audio_file/play/speakenglish', 'postid': 3913, 'start': 27, 'section': 4}, {'localposition': 0, 'end': 8, 'audio_path': 'audio_file/play/moonfestival', 'postid': 5186, 'start': 0, 'section': 0}, {'localposition': 1, 'end': 17, 'audio_path': 'audio_file/play/moonfestival', 'postid': 5186, 'start': 9, 'section': 1}, {'localpo

In [18]:
# A student counts as dropped out when their last login is before this date.
DROPOUT_CUTOFF = datetime.date(2016, 10, 1)

# Count dropped-out students and tally the last video each of them chose,
# keyed as 'video: <id>'.
dropout_video = {}
num_dropout = 0
for student in students:
    chosen_video = student['member_info']['chosen_video']
    # Only the first whitespace-separated token (a 'Y-M-D' date) is used.
    last_login_date = student['member_info']['last_login_time'].split()[0]
    year, month, day = (int(part) for part in last_login_date.split('-')[:3])
    # A year of 0 is treated as a dropout without building a date, because
    # datetime.date rejects year 0.
    if year != 0 and datetime.date(year, month, day) >= DROPOUT_CUTOFF:
        continue
    # Dropouts with no chosen video are still counted in num_dropout.
    num_dropout += 1
    if not chosen_video:
        continue
    # Use a dedicated name for the key so the global `last_chosen_video`
    # dict is not overwritten with a string.
    dropout_key = 'video: ' + str(chosen_video[-1])
    dropout_video[dropout_key] = dropout_video.get(dropout_key, 0) + 1

In [19]:
# Show how many students were classified as dropouts.
print (num_dropout)

5853


In [20]:
# ('video: <id>', dropout count) pairs ordered by ascending count.
sort_dropout_video = sorted(dropout_video.items(), key=operator.itemgetter(1))

In [21]:
# Chart the last-video tallies of dropped-out students.
# NOTE(review): generate_bar_chart is not defined in this notebook; it
# presumably comes from the graphAPI star import — confirm.
generate_bar_chart(sort_dropout_video, "Students' dropped out videos")

In [28]:
# Histogram of course lengths over all videos, and the same histogram limited
# to videos that appear in dropout_video. Each video counts once, regardless
# of how many dropouts ended on it.
course_length = {}
dropout_course_length = {}
for video in video_data:
    courselen = video['courseLen']
    course_length[courselen] = course_length.get(courselen, 0) + 1
    if ('video: ' + str(video['postID'])) in dropout_video:
        dropout_course_length[courselen] = dropout_course_length.get(courselen, 0) + 1

In [37]:
# Histogram of section lengths over all videos in original_video_data, and
# the same histogram limited to videos that appear in dropout_video.
section_length = {}
dropout_section_length = {}
for post_id, features in original_video_data.items():
    sectionlen = features['sectionLength']
    section_length[sectionlen] = section_length.get(sectionlen, 0) + 1
    if ('video: ' + str(post_id)) in dropout_video:
        dropout_section_length[sectionlen] = dropout_section_length.get(sectionlen, 0) + 1

In [38]:
# Pair each course-length histogram (sorted by length) with its series label.
sort_course_length = sorted(course_length.items(), key=operator.itemgetter(0))
sort_dropout_course = sorted(dropout_course_length.items(), key=operator.itemgetter(0))
course_length_distribution = [
    [sort_course_length, 'total videos'],
    [sort_dropout_course, 'dropout videos'],
]

In [39]:
# Pair each section-length histogram (sorted by length) with its series label.
sort_section_length = sorted(section_length.items(), key=operator.itemgetter(0))
sort_dropout_section = sorted(dropout_section_length.items(), key=operator.itemgetter(0))
section_length_distribution = [
    [sort_section_length, 'total videos'],
    [sort_dropout_section, 'dropout videos'],
]

In [41]:
# Chart the two section-length series (all videos vs. dropout videos).
# NOTE(review): generate_grouped_bar is not defined in this notebook; it
# presumably comes from the graphAPI star import — confirm.
generate_grouped_bar(section_length_distribution, "Distribution of videos' section length")

In [25]:
# Print the full record of every video whose postID is the string '3711'.
for video in (entry for entry in video_data if entry['postID'] == '3711'):
    print (video)

{'sentence': [{'cht': '\ufeff我愛你。', 'eng': 'I love you.'}, {'cht': '嘿...', 'eng': 'Hey...'}, {'cht': '嗯...在我之前有幾個人？', 'eng': 'Um...how many were there before me?'}, {'cht': '你的意思是什麼？', 'eng': 'What do you mean?'}, {'cht': '像是，在我之前你愛過幾個女孩？', 'eng': 'Like, how many girls did you love before me?'}, {'cht': '愛嗎？五個。在你之前我愛過五個女人。', 'eng': 'Love? Five. I loved five women before you.'}, {'cht': '她們的名字是什麼？', 'eng': 'What were their names?'}, {'cht': '人、事、時、地、原因。', 'eng': 'Who, What, When, Where, Why.'}, {'cht': '你可以和我談談她們嗎？', 'eng': 'Can you tell me about them?'}, {'cht': '我愛的「人」是一個大學時期的女孩。', 'eng': 'Who I loved was a girl from college.'}, {'cht': '我不是和她非常親近，但是有著一些表象和幾次在一個學期之間的互動，你知道的(就像大部分的男生，幻想著一個他們幾乎不認識的女孩)我下了結論，就像一位童話故事的作者一樣。', 'eng': "I wasn't exactly close to her, but with some superficial facts and a few interactions over a semester, you know (like most guys, fantasizing over a girl they barely know), I filled in the blanks like a fairytale author."}, {'cht': '而她在我腦海中所成為的那個人可能超過了真實。', 'en

In [26]:
# Print the question threads of students whose last chosen video is 3913,
# skipping students with no questions; `i` counts the students printed.
i = 0
for student in students:
    info = student['member_info']
    picks = info['chosen_video']
    if len(picks) == 0 or picks[-1] != 3913:
        continue
    if len(info['question']) == 0:
        continue
    message = info['question']
    print (message)
    i += 1

[{'sentenceid': '15', 'message': 'We speak in sound units.<br/>"units"在原音的讀法和老師的朗讀讀法聽起來兩者似乎不同是嗎?我想了解原音的讀法或音標,謝謝～', 'postid': '3913', 'id': '1352', 'type': '發音', 'memberid': '51392', 'reply': [{'message': "unit 的音標為 ['jʊnɪt]，這裡原音和老師朗讀其實是一樣的，只是原音後面 nit 的音很輕，而老師念得比較明顯。只要發音對了，其實不用百分之百一定要跟原音念的一模一樣，最重要的是說話的節奏！", 'memberid': '122', 'createdate': '2014-08-13 08:01:26', 'id': '1353'}], 'createdate': '2014-08-13 07:07:50'}, {'sentenceid': '12', 'message': "You've got to train your ear to hear the different musical notes  as well as the  beats, the rhythms.<br/>請問此句的ear為什麼用單數？而不是用ears呢？", 'postid': '3913', 'id': '1351', 'type': '用法', 'memberid': '51392', 'reply': [{'message': '這裡用 ear 或是 ears 其實都可以，因為這裡是沒有稿子、比較口語的敘述，有些小地方就比較隨性，沒那麼講究。', 'memberid': '122', 'createdate': '2014-08-13 08:03:41', 'id': '1354'}], 'createdate': '2014-08-13 06:46:03'}, {'sentenceid': '21', 'message': 'The only way you can  develop an ear is to listen to the way in which native speakers of any language are speaking, and be