In [1]:
# Load the NumPy library
import numpy as np
# Use NumPy's random module in place of Python's built-in random
import numpy.random as random

# Load the SciPy library
import scipy as sp
# Linear-algebra routines
import scipy.linalg as linalg
# Function for optimization (finding the minimum of a scalar function)
from scipy.optimize import minimize_scalar

# Load the pandas library
import pandas as pd
# Series handles 1-D data, DataFrame handles 2-D data
from pandas import Series, DataFrame

# Expose pyplot under the alias plt; most plotting functions live in pyplot
import matplotlib.pyplot as plt
# Load the matplotlib library
import matplotlib as mpl
# seaborn makes charts look nicer
import seaborn as sns
sns.set()
# Magic command needed to display charts inline in the Jupyter notebook
%matplotlib inline

from sklearn import  linear_model

# Display floating-point results to three decimal places
%precision 3

'%.3f'

### 資料的讀取與對話

#### 讀取網路等處公開的對象資料

確認當前目錄

In [11]:
# Show the current working directory (bare `pwd` relies on IPython automagic for %pwd)
pwd

'E:\\PythonProject\\Study-PyCharm'

建立資料夾

In [6]:
# Create the chap3 working folder inside the current directory.
# os.makedirs with exist_ok=True is a no-op when the folder already exists,
# so this cell can be re-run (or Run All repeated) without raising an error,
# and it does not depend on a platform-specific shell `mkdir` alias.
import os
os.makedirs('chap3', exist_ok=True)

移動到資料夾

In [12]:
# Move into the chap3 folder so the archive is extracted there and later relative paths resolve inside it
cd ./chap3

E:\PythonProject\Study-PyCharm\chap3


下載ZIP等檔案的函式庫

In [14]:
# Libraries for fetching data from the web and handling ZIP archives
# requests: send and receive data over HTTP
# zipfile: read ZIP-format files
import requests, zipfile
# io: file-like stream objects (e.g. StringIO for text, BytesIO for bytes)
from io import StringIO
import io

下載ZIP檔案並解壓縮

In [15]:
# URL of the zipped data set to download
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00356/student.zip'
# Fetch the archive. The whole body is consumed through r.content below, so
# streaming is not needed; the timeout keeps the cell from hanging forever
# when the server does not respond.
r = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of passing an error page to
# ZipFile, which would otherwise fail with a confusing BadZipFile exception.
r.raise_for_status()
# Wrap the downloaded bytes in an in-memory buffer, open it as a ZIP archive,
# and extract every member into the current working directory
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

顯示當前目錄裡的檔案列表

In [16]:
ls

 磁碟區 E 中的磁碟是 Data 1.0T
 磁碟區序號:  463C-E2F8

 E:\PythonProject\Study-PyCharm\chap3 的目錄

2020/09/29  上午 10:04    <DIR>          .
2020/09/29  上午 10:04    <DIR>          ..
2020/09/29  上午 10:04             3,206 student.txt
2020/09/29  上午 10:04            56,993 student-mat.csv
2020/09/29  上午 10:04               269 student-merge.R
2020/09/29  上午 10:04            93,220 student-por.csv
               4 個檔案         153,688 位元組
               2 個目錄  768,140,279,808 位元組可用


#### 資料的讀取與確認

In [22]:
# Load the CSV into a DataFrame using read_csv's default separator (a comma)
student_data_math = pd.read_csv("student-mat.csv")
# Preview the first rows; head() returns 5 rows when no count is given
student_data_math.head()

Unnamed: 0,school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3
0,"GP;""F"";18;""U"";""GT3"";""A"";4;4;""at_home"";""teacher..."
1,"GP;""F"";17;""U"";""GT3"";""T"";1;1;""at_home"";""other"";..."
2,"GP;""F"";15;""U"";""LE3"";""T"";1;1;""at_home"";""other"";..."
3,"GP;""F"";15;""U"";""GT3"";""T"";4;2;""health"";""services..."
4,"GP;""F"";16;""U"";""GT3"";""T"";3;3;""other"";""other"";""h..."


In [25]:
# Re-read the file, this time telling pandas that fields are separated by
# semicolons (sep=';') rather than the default comma
student_data_math = pd.read_csv("student-mat.csv", sep=";")
# Preview the first rows; head() returns 5 rows when no count is given
student_data_math.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


#### 確認資料的性質

In [26]:
# Check the number of entries and the data type of each column
# info() prints a summary covering every column
student_data_math.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

#### 量的資料與質的資料

In [29]:
# Print the first rows of one qualitative and one quantitative column,
# each on its own block of lines
print(
    student_data_math['sex'].head(),       # qualitative data
    student_data_math['absences'].head(),  # quantitative data
    sep='\n',
)

0    F
1    F
2    F
3    F
4    F
Name: sex, dtype: object
0     6
1     4
2    10
3     2
4     4
Name: absences, dtype: int64


In [30]:
# Average age within each sex: group the age column by the sex column, then take the mean
student_data_math['age'].groupby(student_data_math['sex']).mean()

sex
F    16.730769
M    16.657754
Name: age, dtype: float64