# Data Mining Assignment2

## 1. 数据收集

In [None]:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
import numpy as np
from datetime import timedelta

# API endpoint that returns the Covid-19 statistics as a JSON document
url = "https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=chinaDayList,chinaDayAddList,nowConfirmStatis,provinceCompare"

# The response body is JSON, so decode it directly rather than routing it
# through an HTML parser first (an HTML parser may rewrite entities or markup
# that appear inside the payload). The `with` block closes the HTTP response
# as soon as the body has been read.
with urlopen(url) as response:
    res = json.loads(response.read())

# Keep only the two daily series used by the rest of the notebook
chinaDayList = res['data']['chinaDayList']
chinaDayAddList = res['data']['chinaDayAddList']


## 2. 创建DataFrame存储数据

In [None]:
# Number of daily records copied from the API lists into the dataframe
num_days = 60

# Create an empty dataframe with one column per statistic
df = pd.DataFrame(columns=['index', 'confirm', 'suspect', 'dead', 'heal', 'nowConfirm', 'localConfirmadd', 'time'])

# Copy the first `num_days` entries of the API lists into df, one row per day
for i in range(num_days):
    day_entry = chinaDayList[i]

    # 'date' holds month and day separated by '.', 'y' holds the year;
    # merge them into a single 'YYYY-MM-DD' string
    date_parts = day_entry['date'].split('.')
    month = date_parts[0]
    day = date_parts[1]
    year = day_entry['y']
    date_str = year + '-' + month + '-' + day  # eg: '2022-03-20'

    # Row labels and the 'index' column are both 1-based; the 'index' column
    # is what the linear regression later uses as its input variable.
    df.loc[i + 1] = [
        i + 1,
        day_entry['confirm'],
        day_entry['suspect'],
        day_entry['dead'],
        day_entry['heal'],
        day_entry['nowConfirm'],
        chinaDayAddList[i]['localConfirmadd'],
        date_str,
    ]

# Append a placeholder row for the day after the last loaded day so that this
# date can also be looked up as a prediction target. Its statistics are left
# as empty strings because they are not known yet.
last_day = df.iloc[-1]['time']
# The dates above were written with '-' separators, so the parse format must
# use '-' as well (a '/' format does not match 'YYYY-MM-DD' strings).
last_day_ts = pd.to_datetime(last_day, format='%Y-%m-%d')  # Convert string to timeStamp
one_more_day_ts = last_day_ts + timedelta(days=1)
one_more_day = one_more_day_ts.strftime('%Y-%m-%d')  # Convert timeStamp to string
next_index = len(df) + 1
df.loc[next_index] = [next_index, '', '', '', '', '', '', one_more_day]  # Add a row

df

## 3. 清洗数据

In [None]:
# Drop, in place, every row with a missing (NaN/None) value in any of the
# statistic columns. Empty strings are not treated as missing by dropna, so
# rows that only hold '' placeholders are kept.
stat_columns = ['confirm', 'suspect', 'dead', 'heal', 'nowConfirm', 'localConfirmadd']
df.dropna(subset=stat_columns, inplace=True)
df


## 4. 输入希望预测的日期并截取30天数据

In [None]:
# Input the date you want to predict as target_date
target_date = '2022-04-26' # This is an example input
# target_date uses '-' separators, so the parse format must use '-' as well
# (a '/' format does not match 'YYYY-MM-DD' strings).
target_date_ts = pd.to_datetime(target_date, format='%Y-%m-%d') # The target date in the format of timeStamp

start_date_ts = target_date_ts - timedelta(days=30) # 30 days from the target is the start date
start_date = start_date_ts.strftime('%Y-%m-%d')

# Row label of the start date; fail with a readable message when the date is
# not among the loaded rows instead of a bare "list index out of range".
start_matches = df[(df.time == start_date)].index.tolist()
if not start_matches:
    raise IndexError(
        'start date ' + start_date + ' (30 days before ' + target_date + ') is not in df'
    )
start_index = start_matches[0]

# Row labels are 1-based while iloc positions are 0-based, hence the
# `start_index - 1`. The slice takes 30 consecutive rows beginning at the
# start-date row (assuming one row per consecutive day, this ends on the day
# before target_date).
df2 = df.iloc[start_index - 1 : start_index + 29]

df2

## 5. 进行预测和评估

In [None]:
lst = ['confirm', 'suspect', 'dead', 'heal', 'nowConfirm', 'localConfirmadd']

# Neither the regression input nor the target row label depends on the
# indicator, so both are computed once before the loop.
x = df2[['index']].values  # the 'index' column is the only input variable
target_index = df[(df.time == target_date)].index.tolist()[0]  # row label of the target date

# Fit and evaluate one linear model per indicator
for indicator in lst:
    y = df2[[indicator]].values

    # Hold out 20% of the rows for testing; the split is random on every run
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Fit on the training rows (fit returns the estimator itself)
    lm = linear_model.LinearRegression()
    model = lm.fit(x_train, y_train)

    # RMSE on the held-out rows
    y_pred = lm.predict(x_test)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    # Predict the indicator at the target date's row label
    result = model.predict([[target_index]])

    slope = lm.coef_[0][0]
    intercept = lm.intercept_[0]
    # A negative intercept already prints its own '-' sign, so the '+' is
    # only written for non-negative intercepts.
    slope_suffix = 'x +' if intercept >= 0 else 'x'

    # Output results
    print('RMSE of '+indicator+ ':', rmse)
    print('Predicted ' + indicator + ' cases:', int(result[0][0]))
    print('Linear Fitting Formula of ' + indicator + ': y =', str(slope)+slope_suffix, str(intercept))

    print()
