In [None]:
# Load the required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt 

import os
# Print the full path of every file found under /kaggle/input
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import re
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots    
    
    

<iframe src="https://www.kaggle.com/embed/vivek468/will-the-customer-churn?cellId=2&cellIds=2&kernelSessionId=74773094" height="300" style="margin: 0 auto; width: 100%; max-width: 950px;" frameborder="0" scrolling="auto" title="Will the Customer Churn?😥📉"></iframe>

In [None]:
# Show the current working directory of the notebook
path = os.getcwd()
print(path)

In [None]:
# Load the product and district information tables
products_df = pd.read_csv("/kaggle/input/learnplatform-covid19-impact-on-digital-learning/products_info.csv")
districts_df = pd.read_csv("/kaggle/input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv")

# Shapes of both tables as (rows, columns)
districts_df.shape, products_df.shape

In [None]:
# Preview the first rows of the product table
products_df.head()

In [None]:
# List the files in the engagement_data folder
# NOTE(review): despite the `_df` suffix this is the list returned by os.listdir, not a DataFrame
engagement_df= os.listdir("../input/learnplatform-covid19-impact-on-digital-learning/engagement_data")
engagement_df[0:10]

In [None]:
# Inspect a single engagement file (1000.csv)
engagement_1000_df = pd.read_csv("../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/1000.csv")
engagement_1000_df.head()

## 🏷Preprocessing
* Dropping 57 school districts with NaN states (57/233 ~ 25%)
* Creating a list of product names used by each sector in 2020
* One-hot encoding the product sectors
* Splitting up the primary essential function into main and sub categories
* Adding district_id to the engagement_data file
* Verifying the data by date in the engagement_data file.

### 📣dropping 57 school districts with NaN states (57/233 ~ 25%)

In [None]:
# Count the NaN values in each column of districts_df
districts_df.isna().sum()

In [None]:
# Remove the districts whose `state` is missing (NaN) and rebuild the index.
# The shape is printed before and after so the number of dropped rows is visible.
print(districts_df.shape)
districts_df = districts_df.dropna(subset=['state']).reset_index(drop=True)
print(districts_df.shape)

### 📣Products name list that used in 2020 by sectors
*  The sectors I consider important are: PreK-12 and PreK-12; Higher Ed; Corporate

In [None]:
# Find the distinct values that occur in the Sector(s) column
products_df['Sector(s)'].unique()

In [None]:
# Number of products per Sector(s) value
products_df['Sector(s)'].value_counts()

In [None]:
# PreK-12
#   1. Select only the rows whose Sector(s) value is exactly 'PreK-12' and
#      store them in prek12_df (the _df suffix marks a DataFrame).
#   2. List the distinct product names in that subset.
is_prek12_only = products_df['Sector(s)'] == 'PreK-12'
prek12_df = products_df[is_prek12_only]

prek12_df['Product Name'].unique()

In [None]:
# Products whose Sector(s) value is exactly 'PreK-12; Higher Ed; Corporate',
# followed by the distinct product names among them.
prek12_higher_cor_df = products_df.loc[
    products_df['Sector(s)'].eq('PreK-12; Higher Ed; Corporate')
]
prek12_higher_cor_df['Product Name'].unique()

### 📣one-hot encoding the product sectors

In [None]:
# Split each "; "-separated Sector(s) string into one indicator (0/1) column per sector
temp_sectors = products_df['Sector(s)'].str.get_dummies(sep="; ")
temp_sectors.head()

In [None]:
# Prefix every indicator column with "sector_" and strip the spaces from its name.
temp_sectors.columns = ["sector_" + c.replace(' ', '') for c in temp_sectors.columns]

In [None]:
# Show the renamed indicator column names
temp_sectors.columns

In [None]:
# Attach the sector indicator columns to the product table (joined on the index)
products_df = products_df.join(temp_sectors)
products_df.head()

In [None]:
# The original Sector(s) column is no longer needed once the indicator
# columns have been joined; drop it and discard the temporary frame.
products_df = products_df.drop(columns="Sector(s)")
print(products_df.columns)
del temp_sectors

### 📣 Separating main and sub functions in Primary Essential Function

In [None]:
# Split "Primary Essential Function" on ' - ' into its first part (main
# category) and second part (sub category).
# The vectorised .str accessor keeps NaN as NaN and, unlike x.split(' - ')[1],
# yields NaN instead of raising IndexError when a value has no ' - ' separator.
_function_parts = products_df['Primary Essential Function'].str.split(' - ')
products_df['pri_function_main'] = _function_parts.str[0]
products_df['pri_function_sub'] = _function_parts.str[1]

In [None]:
# Preview the product table with the new pri_function_main / pri_function_sub columns
products_df.head()

In [None]:
# Merge the two spellings of the same sub category into one, then remove the
# original combined column now that it has been split.
products_df['pri_function_sub'] = products_df['pri_function_sub'].replace(
    'Sites, Resources & References', 'Sites, Resources & Reference')
products_df = products_df.drop(columns="Primary Essential Function")

In [None]:
# Distinct sub categories after synchronising the spellings
products_df['pri_function_sub'].unique()

In [None]:
# Show the sector indicator columns next to the split primary-function columns
products_df[ ['sector_Corporate', 'sector_HigherEd', 'sector_PreK-12',
       'pri_function_main', 'pri_function_sub'] ]

### 📣Adding district_id in engagement_data file

In [None]:
# Distinct district ids; each one is used as an engagement file name below
districts_df.district_id.unique()

In [None]:
PATH = "../input/learnplatform-covid19-impact-on-digital-learning/engagement_data"

# Read one CSV per district id and tag every row with the district it came
# from, since the files themselves are only identified by their file name.
temp = [
    pd.read_csv(f'{PATH}/{district}.csv', index_col=None, header=0).assign(district_id=district)
    for district in districts_df.district_id.unique()
]

len(temp)

In [None]:

# Stack the per-district frames into one table with a fresh 0..n-1 index.
engagement = pd.concat(temp, ignore_index=True)
engagement.head()


## EDA(Exploratory Data Analysis)
### 📣Verifying the data by date in the engagement_data file
* Most school districts have 366 unique dates available in the engagement_data file.  
* However, there are 43 school districts that have less than 366 unique dates available in the data.  
* For district_id 3670, only the data from 2020-02-15 to 2020-03-02 is available.  
* For district_id 2872, data is available for January 2020; after that, only two individual days are available: February 1st and March 1st.  

In [None]:
# Number of distinct districts present in the engagement table
len(engagement.district_id.unique())

In [None]:
# Column dtypes and memory usage of the combined engagement table
engagement.info()

In [None]:
# Pick one district (3670) and look at the distinct `time` values it has
engagement[engagement['district_id']==3670].time.unique()

# It does not cover 366 days.

### ✔ Plotting a histogram of the count of available dates (366 days) for each district_id

In [None]:
# Distribution of the number of distinct dates available per district.
days_per_district = engagement.groupby('district_id')['time'].nunique()

fig, ax = plt.subplots(1, 1, figsize=(8,4))
sns.histplot(days_per_district, bins=30, ax=ax)
ax.set_title('Unique Days of Engagement Data per District')
plt.show()

In [None]:
# Rebuild `engagement` so that it only contains districts with data for all
# 366 distinct dates.
# The rows of every district are already loaded in `engagement`, so the frame
# is filtered in memory instead of reading every CSV from disk a second time.
FULL_YEAR_DAYS = 366  # number of distinct dates a complete district has

temp = []  # release the per-district frames kept from the first load

days_per_district = engagement.groupby('district_id')['time'].nunique()
complete_districts = days_per_district[days_per_district == FULL_YEAR_DAYS].index

engagement = engagement[engagement['district_id'].isin(complete_districts)].reset_index(drop=True)

In [None]:
# Shapes of the district and product tables as (rows, columns)
districts_df.shape, products_df.shape

In [None]:
# Same histogram as before, drawn again on the rebuilt engagement table.
unique_days = engagement.groupby('district_id')['time'].nunique()

fig, ax = plt.subplots(1, 1, figsize=(8,4))
sns.histplot(unique_days, bins=30, ax=ax)
ax.set_title('Unique Days of Engagement Data per District')
plt.show()

### ✔ Combining data from the 366-day datasets only
* Merge the column names with the corresponding file names.

In [None]:
# isin() keeps only the rows whose value appears in the given collection.
# Restrict districts_df to the district ids that are present in `engagement`.
kept_district_ids = engagement.district_id.unique()
districts_df = districts_df[districts_df.district_id.isin(kept_district_ids)].reset_index(drop=True)

# Restrict products_df to the LP IDs that are present in `engagement`.
kept_lp_ids = engagement.lp_id.unique()
products_df = products_df[products_df['LP ID'].isin(kept_lp_ids)].reset_index(drop=True)

In [None]:
# Preview the product table after the restriction to products seen in `engagement`
products_df.head()

### 📣 Identifying school districts available in the dataset
* The states with the highest number of school districts available in the dataset are CT (29) and UT (24).
* There are states with only one school district available in the dataset, including FL, TN, NY, and AZ.

In [None]:
# Lookup table from full state / territory name to its two-letter abbreviation.
# Keys are title-cased, including 'District Of Columbia' with a capital "Of".
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District Of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

In [None]:
# Add the abbreviation for each state; replace() leaves any value that is not
# a key of us_state_abbrev unchanged.
districts_df['state_abbrev'] = districts_df['state'].replace(us_state_abbrev)
districts_df

In [None]:
# Count the districts per state abbreviation and turn the result into a two-column frame
districts_df_by_state = districts_df['state_abbrev'].value_counts().to_frame().reset_index(drop=False)
districts_df_by_state.head(15)

In [None]:
# Give the two columns of districts_df_by_state explicit names (assigned by position)
districts_df_by_state.columns = ['state_abbrev', 'num_districts']

# Choropleth of the USA coloured by the number of districts per state
fig = go.Figure()
layout = dict(
    title_text = "Number of Available School Districts per State",
    geo_scope='usa',
)

fig.add_trace(
    go.Choropleth(
        locations=districts_df_by_state.state_abbrev,
        # NOTE(review): zmax is set without zmin while z holds district counts;
        # confirm that an upper colour bound of 1 is intended here
        zmax=1,
        z = districts_df_by_state.num_districts,
        locationmode = 'USA-states', # set of locations match entries in `locations`
        marker_line_color='white',
        geo='geo',
        colorscale=px.colors.sequential.Teal, 
    )
)

fig.update_layout(layout)   
fig.show()

### 📣 Checking the distribution of pri_function_main
The most common categories in the 'Primary Essential Function' column are Learning & Curriculum (LC), Classroom Management (CM), and School & District Operations (SDO).

In [None]:
# Check the distribution of pri_function_main
products_df['pri_function_main'].value_counts()

In [None]:
# Distinct sub categories among the remaining products
products_df['pri_function_sub'].unique()

In [None]:
# Count plots of the primary-function categories, side by side.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))

# Fig1: number of products in each main category
sns.countplot(data=products_df, x='pri_function_main', palette='GnBu', ax=ax[0])
ax[0].set_title('Main Categories in Primary Functions')

# Fig2: sub categories of the products whose main category is 'LC'
lc_products = products_df[products_df['pri_function_main'] == 'LC']
sns.countplot(data=lc_products, x='pri_function_sub', palette='GnBu', ax=ax[1])
ax[1].set_title('Sub-Categories in Primary Function LC')
# The sub-category names are long, so draw the x tick labels vertically.
ax[1].tick_params(axis='x', labelrotation=90)

plt.show()

### 📣 Number of records per [pri_function_main] and [pri_function_sub]

In [None]:
# Number of products for each sub category, grouped by main category
display(products_df.groupby('pri_function_main')['pri_function_sub'].value_counts().to_frame())

### 📣 Let's take a look at the product names that correspond to Virtual Classroom!

In [None]:
 products_df[products_df['pri_function_sub'].isin(['Virtual Classroom'])]

### After the summer vacation, pct_access for Virtual Classroom increases to a higher level as observed in the early stages of the pandemic and remains relatively stable.

In [None]:
# Find the LP IDs of the products whose sub category is 'Virtual Classroom'.
virtual_classroom_lp_id = products_df[
                          products_df.pri_function_sub == 'Virtual Classroom']['LP ID'].unique()

# Resolve every LP ID to its product name once, instead of once per plotted line.
virtual_classroom_names = {
    lp_id: products_df[products_df['LP ID'] == lp_id]['Product Name'].values[0]
    for lp_id in virtual_classroom_lp_id
}

# Remove weekends from the dataframe.
# DatetimeIndex.weekday is an integer where Monday is 0 and Sunday is 6.
engagement['weekday'] = pd.DatetimeIndex(engagement['time']).weekday
engagement_without_weekends = engagement[engagement.weekday < 5]


def _plot_virtual_classroom_metric(metric):
    """Plot the per-date mean of `metric` as one line per Virtual Classroom product.

    metric: name of the `engagement_without_weekends` column to average
        (called below with 'pct_access' and 'engagement_index').
    """
    f, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 6))
    for lp_id in virtual_classroom_lp_id:
        product_rows = engagement_without_weekends[engagement_without_weekends.lp_id == lp_id]
        # Mean over all districts for each date; reset_index() turns `time` back into a column.
        daily_mean = product_rows.groupby('time')[metric].mean().reset_index()
        # NOTE(review): `time` is plotted as stored in the CSVs; if it is a string
        # column the x axis is categorical - confirm that this is intended.
        sns.lineplot(x=daily_mean['time'], y=daily_mean[metric],
                     label=virtual_classroom_names[lp_id], ax=ax)
    ax.legend()
    plt.show()


# Figure 1
_plot_virtual_classroom_metric('pct_access')

# Figure 2
_plot_virtual_classroom_metric('engagement_index')

### ⬆ Key findings
* Homeschooling started in early March.
* There is a dip between March and July.
* There are no classes during the summer break in July and August, resulting in no attendance.
* After the summer break, pct_access increases to a higher level, similar to what was observed during the initial outbreak of the pandemic, and remains somewhat stable.
* There are some drops in pct_access throughout the year, possibly due to holidays or other breaks.
* Zoom and Meet are the most popular products in virtual classrooms.