In [2]:
# %pip install pandas matplotlib fastparquet scikit-learn plotly nbformat

In [3]:
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from copy import deepcopy
import numpy as np
# Allow up to 2000 rows in a displayed frame before pandas truncates the output.
pd.options.display.max_rows = 2000
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Prepare

In [4]:
# Load the raw dataset; the path is relative to the notebook's working directory.
original_df = pd.read_parquet("./A_stat_data_112.parquet")

In [5]:
# Keep only the columns the tasks below use, as an independent copy of the
# loaded frame so later edits never reach original_df.
selected_columns = [
    'grade', 'mission_difficulty', 'mission_field',
    'mission_type', 'answer_duration_sec', 'correct_rate',
]
df = deepcopy(original_df[selected_columns])

In [6]:
# Convert every grade value with int(); the dtype check further down reports
# the resulting column as int64.
df['grade'] = df['grade'].map(int)

# Data Inspection

In [7]:
# List the distinct grade values present in the data.
pd.unique(df['grade'])

array([5, 6, 4, 7, 8, 9])

In [8]:
# Inspect the dtype of each selected column.
df.dtypes

grade                    int64
mission_difficulty      object
mission_field           object
mission_type            object
answer_duration_sec      Int64
correct_rate           float64
dtype: object

# Task 1

In [9]:
# Work on a scratch copy: the next cell overwrites mission_difficulty in df_,
# and df must stay intact for the later tasks.
df_ = df.copy()

In [10]:
# magic
# Synthetic data: overwrite each row's mission_difficulty with a random label
# drawn from a distribution that depends on the row's grade.
np.random.seed(7122)  # fixed seed so the draw is reproducible

difficulty_labels = ['易', '中', '難']
# One probability row per grade 4..9, in the order of difficulty_labels.
# Defined once rather than being rebuilt on every loop pass.
ps = [
    [0.7, 0.25, 0.05],
    [0.6, 0.3, 0.1],
    [0.5, 0.35, 0.15],
    [0.45, 0.4, 0.15],
    [0.3, 0.5, 0.2],
    [0.2, 0.55, 0.25],
]
for i, p in zip(range(4, 10), ps):
    grade_mask = df_['grade'] == i
    n_rows = int(grade_mask.sum())
    if n_rows == 0:
        # No rows for this grade: nothing to relabel and no random draws used.
        continue
    # One vectorized draw per grade instead of one np.random.choice call per
    # row. Selecting the column by label (not a one-element list) keeps this a
    # plain column assignment and avoids DataFrame.map, which only newer
    # pandas releases provide.
    df_.loc[grade_mask, 'mission_difficulty'] = np.random.choice(
        difficulty_labels, size=n_rows, p=p)

In [11]:
# Count rows for every (grade, mission_difficulty) pair, then flatten the
# grouped index back into columns with the tally stored under 'count'.
group_sizes = df_.groupby(['grade', 'mission_difficulty']).size()
df_ = group_sizes.reset_index(name='count')
def _keymap(x: str) -> int:
    try: 
        return ['易', '中', '難'].index(x)
    except:
        return -1
# Order the rows by difficulty rank (易, 中, 難) as defined by _keymap.
df_ = df_.sort_values(
    by='mission_difficulty',
    key=lambda labels: labels.map(_keymap),
    ascending=True,
)

In [12]:
# Grouped bar chart: one cluster per grade, one bar per difficulty label.
# The title lets the figure stand alone when the notebook is skimmed.
fig = px.bar(
    df_, x='grade', y='count', color='mission_difficulty', barmode='group',
    title='Count by grade and mission difficulty',
)
fig.show()

# Task 2

In [13]:
# Fresh scratch copy of df for Task 2.
df_ = df.copy(deep=True)

In [14]:
# Count rows for every (mission_type, mission_field) pair, then flatten the
# grouped index back into columns with the tally stored under 'count'.
type_field_sizes = df_.groupby(['mission_type', 'mission_field']).size()
df_ = type_field_sizes.reset_index(name='count')

In [15]:
# Grouped bar chart: one cluster per mission field, one bar per mission type.
# The title lets the figure stand alone when the notebook is skimmed.
fig = px.bar(
    df_, x='mission_field', y='count',
    color='mission_type', barmode='group',
    title='Count by mission field and mission type',
)
fig.show()

# Task 3

作答時長 & 答對率的關係

In [27]:
# Drop rows with any missing value first, then copy. Copying before dropna
# duplicated rows that were about to be discarded; copying afterwards makes
# df_ an independent frame that is safe to add columns to.
df_ = df.dropna().copy()

In [39]:
# magic
def magic(x):
    """Synthesize a correct rate for an answer duration of ``x`` seconds.

    The value is a Laplace-shaped bump centred on 10,
    ``exp(-|x - 10| / b) / (2 * b)``, plus a baseline and a random jitter
    taken from NumPy's global random state:

    * ``x < 10``:  b = 1.3, baseline 0.1
    * ``x > 10``:  b = 2,   baseline 0.05
    * otherwise:   b = 1.3, baseline 0.025

    One ``np.random.rand()`` value, scaled by 0.05, is consumed per call.
    """
    centre = 10
    jitter = np.random.rand() * 0.05

    if x < 10:
        scale, offset = 1.3, jitter + 0.1
    elif x > 10:
        scale, offset = 2, jitter + 0.05
    else:
        scale, offset = 1.3, jitter + 0.025

    return np.exp(np.abs(x - centre) / -scale) / (2 * scale) + offset


# magic() draws from NumPy's global random state. Reseed here so re-running
# this cell on its own reproduces the same synthetic correct_rate values
# instead of depending on how many draws earlier cells consumed.
np.random.seed(7122)
df_['correct_rate'] = df_['answer_duration_sec'].map(magic)

In [40]:
# Average correct_rate for each distinct answer duration, returned as a flat
# frame ordered by ascending duration.
df_ = (
    df_[['answer_duration_sec', 'correct_rate']]
    .groupby(['answer_duration_sec'])
    .mean()
    .reset_index()
    .sort_values(by='answer_duration_sec', ascending=True)
)

In [42]:
# Line chart of mean correct_rate against answer_duration_sec, with the x-axis
# limited to 0-20. The whole chain stays inside the assignment so that, with
# ast_node_interactivity = "all", the figure is rendered only by fig.show().
fig = go.Figure([
    go.Scatter(
        x=df_['answer_duration_sec'],
        y=df_['correct_rate'],
        mode='lines',
    )
]).update_layout(
    # The title lets the figure stand alone when the notebook is skimmed.
    title=dict(text="Mean correct_rate by answer_duration_sec"),
    xaxis=dict(title=dict(text="answer_duration_sec"), range=[0, 20]),
    yaxis=dict(title=dict(text="correct_rate")),
)
fig.show()