### Import necessary libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt 

import os
# Print the full path of every file found under the Kaggle input directory.
for file_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(file_path)
        
import re
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots    

### Data loading and exploration

In [2]:
# Show the current working directory; the relative "../input/..." paths used
# in later cells are resolved against it.
path = os.getcwd()
print(path)

In [3]:
# Load the two metadata tables: one row per district and one row per product.
districts_df = pd.read_csv("/kaggle/input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")
products_df = pd.read_csv("/kaggle/input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")

# (rows, columns) of each table.
(districts_df.shape, products_df.shape)

In [4]:
# Preview the first rows of the product table.
products_df.head()

In [5]:
# File names inside the engagement_data folder.
# Note: despite its name, `engagement_df` is a plain list of strings, not a DataFrame.
engagement_df = os.listdir("../input/learnplatform-covid19-impact-on-digital-learning/engagement_data")
engagement_df[:10]

In [6]:
# Load a single engagement file ('1000.csv') as a sample and preview its
# first rows to see which columns the per-district files contain.
engagement_1000_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/1000.csv")
engagement_1000_df.head()

## 🏷Preprocessing
* dropping 57 school districts with NaN states (57/233 ~ 25%)
* Listing the products name list in 2020 according to the sector
* one-hot encoding the product sectors
* splitting up the primary essential function into main and sub category
* Adding a 'district_id' column to the engagement_data files
* Checking the data by date.

### 📣dropping 57 school districts with NaN states (57/233 ~ 25%)

In [7]:
# Count the missing values in each column of districts_df.
districts_df.isna().sum()

In [8]:
# Shape before removing the districts whose state is unknown.
print(districts_df.shape)
# Keep only the rows with a non-missing 'state' and renumber the index from 0.
districts_df = districts_df.dropna(subset=['state']).reset_index(drop=True)
# Shape after the removal.
print(districts_df.shape)

### 📣Listing the products name list in 2020 according to the sector
* I think the 'PreK-12' and 'PreK-12; Higher Ed; Corporate' sectors are the important ones.

In [9]:
# List the distinct values that occur in the 'Sector(s)' column.
products_df['Sector(s)'].unique()

In [10]:
# Number of products for each distinct 'Sector(s)' value.
products_df['Sector(s)'].value_counts()

In [11]:
# Products whose sector is exactly 'PreK-12'.
prek12_mask = products_df['Sector(s)'] == 'PreK-12'
prek12_df = products_df.loc[prek12_mask]

# Names of those products.
prek12_df['Product Name'].unique()

In [12]:
# Products whose sector is exactly 'PreK-12; Higher Ed; Corporate'.
all_sectors_mask = products_df['Sector(s)'] == 'PreK-12; Higher Ed; Corporate'
prek12_higher_cor_df = products_df.loc[all_sectors_mask]

# Names of those products.
prek12_higher_cor_df['Product Name'].unique()

### 📣one-hot encoding the product sectors

In [13]:
# One-hot encode 'Sector(s)': the string is split on "; " and each distinct
# sector becomes its own 0/1 indicator column.
temp_sectors = products_df['Sector(s)'].str.get_dummies(sep="; ") 
temp_sectors.head()

In [14]:
# Prefix every indicator column with "sector_" and strip spaces from the
# sector name (e.g. "Higher Ed" -> "sector_HigherEd").
temp_sectors.columns = temp_sectors.columns.map(
    lambda name: f"sector_{name.replace(' ', '')}"
)

In [15]:
# Show the renamed indicator columns.
temp_sectors.columns

In [16]:
# Attach the sector indicator columns to the product table (index-aligned join).
# NOTE: re-running this cell without reloading products_df will fail because
# the joined columns would then already exist.
products_df = products_df.join(temp_sectors)
products_df.head()

In [17]:
# The indicator columns now carry the sector information, so the original
# text column is dropped.
products_df = products_df.drop(columns=["Sector(s)"])
print(products_df.columns)
# The temporary indicator frame is no longer needed.
del temp_sectors

### 📣Splitting Primary Essential Function into main/sub functions

In [18]:
# Split 'Primary Essential Function' on ' - ' and keep its first two parts as
# the main and sub category.
# The vectorised .str accessor leaves missing values as NaN and yields NaN for
# the sub category when a value contains no ' - ' separator, whereas a plain
# x.split(' - ')[1] would raise IndexError in that case.
function_parts = products_df['Primary Essential Function'].str.split(' - ')
products_df['pri_function_main'] = function_parts.str[0]
products_df['pri_function_sub'] = function_parts.str[1]

In [19]:
# Preview the product table with the new main/sub function columns.
products_df.head()

In [20]:
# Merge the two spellings of the same sub-category into a single value.
sub_function_aliases = {'Sites, Resources & References': 'Sites, Resources & Reference'}
products_df['pri_function_sub'] = products_df['pri_function_sub'].replace(sub_function_aliases)

# The combined column has been split into main/sub, so it can be removed.
products_df = products_df.drop(columns=["Primary Essential Function"])

In [21]:
# Distinct sub-categories after merging the duplicate spelling.
products_df['pri_function_sub'].unique()

In [22]:
# Show only the engineered columns: the sector indicators and the main/sub
# function categories.
products_df[ ['sector_Corporate', 'sector_HigherEd', 'sector_PreK-12',
       'pri_function_main', 'pri_function_sub'] ]

### 📣Adding district_id column into 'engagement_data' file

In [23]:
# District ids that remain after dropping the districts without a state.
districts_df.district_id.unique()

In [24]:
PATH = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data"

# Read the engagement file of every remaining district (files are named
# "<district_id>.csv") and tag its rows with that district id.
temp = [
    pd.read_csv(f'{PATH}/{district}.csv', index_col=None, header=0).assign(district_id=district)
    for district in districts_df.district_id.unique()
]

# Number of district files that were loaded.
len(temp)

In [25]:

# Stack the per-district frames into one table with a fresh 0..n-1 index.
engagement = pd.concat(temp, ignore_index=True)
engagement.head()


## 🏷EDA(Exploratory Data Analysis)

### 📣Checking the data by date.

* There are 366 days in 2020.
* However, 43 districts contain fewer than 366 days of data.
* e.g. for district_id 3670, only data between 2020-02-15 and 2020-03-02 is available.
* e.g. for district_id 2872, only data for January 2020 plus February 1st and March 1st (2 days) is available.

In [26]:
# Number of distinct districts present in the combined engagement table.
engagement['district_id'].nunique()

In [27]:
# Column dtypes, non-null counts and memory usage of the combined table.
engagement.info()

In [28]:
# Inspect one example district (id 3670, hard-coded) and list the distinct
# 'time' values for which it has engagement rows.
engagement[engagement['district_id']==3670].time.unique()

### ✔The number of available dates (out of 366 days) for each 'district_id' is shown as a histogram.

In [29]:
# Number of distinct dates each district reports.
days_per_district = engagement.groupby('district_id')['time'].nunique()

# Histogram of those per-district day counts.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(days_per_district, bins=30, ax=ax)
ax.set_title('Unique Days of Engagement Data per District')
plt.show()

In [30]:
# Keep only the districts that have engagement data for all 366 days of 2020.
# Every district file is already loaded into `engagement`, so that frame is
# filtered directly instead of reading each CSV from disk a second time.
# Row order and columns match what re-loading and concatenating would give.
days_per_district = engagement.groupby('district_id')['time'].transform('nunique')
engagement = engagement[days_per_district == 366].reset_index(drop=True)

# Release the per-district frames that were only needed to build `engagement`.
temp = []

In [31]:
# (rows, columns) of the district and product tables at this point.
districts_df.shape, products_df.shape

In [32]:
# Same histogram as before, recomputed on the filtered engagement table to
# confirm that only districts with complete data remain.
remaining_days_per_district = engagement.groupby('district_id')['time'].nunique()

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(remaining_days_per_district, bins=30, ax=ax)
ax.set_title('Unique Days of Engagement Data per District')
plt.show()

### ✔Only districts that have data for all 366 days are kept.

In [33]:

# Ids that actually occur in the filtered engagement data.
kept_district_ids = engagement['district_id'].unique()
kept_product_ids = engagement['lp_id'].unique()

# Restrict both metadata tables to those ids and renumber their indexes.
districts_df = districts_df.loc[districts_df['district_id'].isin(kept_district_ids)].reset_index(drop=True)
products_df = products_df.loc[products_df['LP ID'].isin(kept_product_ids)].reset_index(drop=True)

In [34]:
# Preview the product table after restricting it to products that appear in
# the engagement data.
products_df.head()

### 📣 Available districts
* The states with the most school districts are CT (29) and UT (24).
* The states with only one district are FL, TN, NY and AZ.

In [35]:
# Lookup table from full state/territory name to its two-letter postal
# abbreviation. Keys must match the spelling used in districts_df['state']
# exactly (note the capitalised 'District Of Columbia').
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District Of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

In [36]:
# Add the two-letter abbreviation for each district's state. Series.replace
# leaves any state name that is not a key of the dict unchanged.
districts_df['state_abbrev'] = districts_df['state'].replace(us_state_abbrev)
districts_df

In [37]:
# Count the districts per state abbreviation and turn the result into a
# two-column frame (the state values move from the index into a column).
# The resulting column names depend on the pandas version, so they are set
# explicitly before plotting.
districts_df_by_state = districts_df['state_abbrev'].value_counts().to_frame().reset_index(drop=False)
districts_df_by_state.head(15)

In [38]:
# Give the two columns explicit names regardless of what value_counts()
# and reset_index() called them.
districts_df_by_state.columns = ['state_abbrev', 'num_districts']

# US choropleth: one colour per state, scaled by its number of districts.
fig = go.Figure()
layout = dict(
    title_text="Number of Available School Districts per State",
    geo_scope='usa',
)

fig.add_trace(
    go.Choropleth(
        locations=districts_df_by_state.state_abbrev,
        # The colour range is derived from the data. A lone `zmax=1` was
        # removed: without a matching `zmin` plotly keeps auto-scaling, and
        # if it were honoured it would saturate every state with more than
        # one district.
        z=districts_df_by_state.num_districts,
        locationmode='USA-states',  # `locations` holds two-letter state codes
        marker_line_color='white',
        geo='geo',
        colorscale=px.colors.sequential.Teal,
        colorbar_title_text='Districts',
    )
)

fig.update_layout(layout)
fig.show()

### 📣Checking the distribution of the 'pri_function_main' column.

* LC (learning & curriculum), CM, and SDO are the main categories in the 'Primary Essential Function' column.

In [39]:
# Number of products in each main category of the primary function.
products_df['pri_function_main'].value_counts()

In [40]:
# Distinct sub-categories among the products that remain after filtering.
products_df['pri_function_sub'].unique()

In [41]:
# Product counts per main category (left panel) and per sub-category within
# the 'LC' main category (right panel).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
main_ax, lc_ax = ax

# Fig1
sns.countplot(x='pri_function_main', data=products_df, palette='GnBu', ax=main_ax)
main_ax.set_title('Main Categories in Primary Functions')

# Fig2
lc_products = products_df[products_df['pri_function_main'] == 'LC']
sns.countplot(x='pri_function_sub', data=lc_products, palette='GnBu', ax=lc_ax)
lc_ax.set_title('Sub-Categories in Primary Function LC')
# Long sub-category names overlap when horizontal, so rotate them.
lc_ax.set_xticklabels(lc_ax.get_xticklabels(), rotation=90)

plt.show()

### 📣Checking the number of data of [pri_function_main] & [pri_function_sub]

In [42]:
# Number of products for every (main category, sub-category) pair.
sub_counts_by_main = products_df.groupby('pri_function_main')['pri_function_sub'].value_counts()
display(sub_counts_by_main.to_frame())

### 📣Let's take a look at the product name in the Virtual Classroom!


In [43]:
 products_df[products_df['pri_function_sub'].isin(['Virtual Classroom'])]

### After summer vacation, the pct_access of Virtual Classroom increased to a higher level as observed in the early stages of infectious diseases and then remained somewhat constant.

In [44]:

# LP IDs of all products in the 'Virtual Classroom' sub-category.
virtual_classroom_lp_id = products_df[
    products_df.pri_function_sub == 'Virtual Classroom']['LP ID'].unique()

# Remove weekends from the dataframe (weekday: Monday=0 ... Sunday=6).
engagement['weekday'] = pd.DatetimeIndex(engagement['time']).weekday
engagement_without_weekends = engagement[engagement.weekday < 5]


def plot_virtual_classroom_metric(metric):
    """Plot the weekday daily mean of `metric` for each Virtual Classroom product.

    For every LP ID in `virtual_classroom_lp_id`, the rows of
    `engagement_without_weekends` are averaged per date (across districts) and
    drawn as one line labelled with the product name.

    Parameters
    ----------
    metric : str
        Name of the numeric engagement column to average, e.g. 'pct_access'
        or 'engagement_index'.
    """
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 6))
    for lp_id in virtual_classroom_lp_id:
        daily_mean = (
            engagement_without_weekends[engagement_without_weekends.lp_id == lp_id]
            .groupby('time')[metric]
            .mean()
            .to_frame()
            .reset_index(drop=False)
        )
        # Parse the dates so the x axis is a real time axis instead of one
        # categorical tick per date string.
        daily_mean['time'] = pd.to_datetime(daily_mean['time'])
        product_name = products_df[products_df['LP ID'] == lp_id]['Product Name'].values[0]
        sns.lineplot(x=daily_mean['time'], y=daily_mean[metric], label=product_name, ax=ax)
    ax.set_title(f'Mean daily {metric} of Virtual Classroom products (weekdays only)')
    ax.set_xlabel('time')
    ax.set_ylabel(metric)
    ax.legend()
    plt.show()


# Figure 1
plot_virtual_classroom_metric('pct_access')

# Figure 2
plot_virtual_classroom_metric('engagement_index')