# DAY1（读取数据）

## 如何导入csv数据：`pandas.read_csv`

> CSV（逗号分隔值）是一种纯文本文件格式，用于存储表格数据（例如电子表格或数据库）。
> 它本质上存储的表格数据包括数字和纯文本。

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 获取路径
import os
path = os.getcwd()
csv_name = "demo.csv"
csv_path = os.path.join(path, f"data\{csv_name}")
# print(csv_path)

In [3]:
csv_url = 'https://raw.githubusercontent.com/datoujinggzj/DataScienceCrashCourse/master/data/demo.csv'

In [4]:
pd.read_csv(csv_url)

Unnamed: 0,Date,New Cases,Total Cases,Recovered,Deaths,Active Cases,Year,Month,Day
0,06/03/2020,1,1,0,0,1,2020,6,3
1,07/03/2020,2,3,0,0,3,2020,7,3
2,08/03/2020,0,3,0,0,3,2020,8,3
3,09/03/2020,1,4,0,0,4,2020,9,3
4,10/03/2020,1,5,0,0,5,2020,10,3
...,...,...,...,...,...,...,...,...,...
816,31/05/2022,99,94742,92215,721,1410,2022,5,31
817,01/06/2022,115,94857,92310,721,1430,2022,1,6
818,02/06/2022,93,94950,92408,722,1424,2022,2,6
819,03/06/2022,118,95068,92512,722,1438,2022,3,6


### 1.1 读取原始数据

In [5]:
pd.read_csv(csv_path)

Unnamed: 0,Date,New Cases,Total Cases,Recovered,Deaths,Active Cases,Year,Month,Day
0,06/03/2020,1,1,0,0,1,2020,6,3
1,07/03/2020,2,3,0,0,3,2020,7,3
2,08/03/2020,0,3,0,0,3,2020,8,3
3,09/03/2020,1,4,0,0,4,2020,9,3
4,10/03/2020,1,5,0,0,5,2020,10,3
...,...,...,...,...,...,...,...,...,...
816,31/05/2022,99,94742,92215,721,1410,2022,5,31
817,01/06/2022,115,94857,92310,721,1430,2022,1,6
818,02/06/2022,93,94950,92408,722,1424,2022,2,6
819,03/06/2022,118,95068,92512,722,1438,2022,3,6


### 1.2 即默认第n+1行为列名：`header = n`

In [6]:
pd.read_csv(csv_path,header = 1)

Unnamed: 0,06/03/2020,1,1.1,0,0.1,1.2,2020,6,3
0,07/03/2020,2,3,0,0,3,2020,7,3
1,08/03/2020,0,3,0,0,3,2020,8,3
2,09/03/2020,1,4,0,0,4,2020,9,3
3,10/03/2020,1,5,0,0,5,2020,10,3
4,11/03/2020,2,7,0,0,7,2020,11,3
...,...,...,...,...,...,...,...,...,...
815,31/05/2022,99,94742,92215,721,1410,2022,5,31
816,01/06/2022,115,94857,92310,721,1430,2022,1,6
817,02/06/2022,93,94950,92408,722,1424,2022,2,6
818,03/06/2022,118,95068,92512,722,1438,2022,3,6


### 1.3 改变列名（或为没有列名的数据赋予列名）：`names = ["列名1","列名2","列名3"..."列名n"]`

In [7]:
pd.read_csv(csv_path,header = 0,names = ['日期','新增','累积','恢复','死亡','现有病例','年','月','日'])

Unnamed: 0,日期,新增,累积,恢复,死亡,现有病例,年,月,日
0,06/03/2020,1,1,0,0,1,2020,6,3
1,07/03/2020,2,3,0,0,3,2020,7,3
2,08/03/2020,0,3,0,0,3,2020,8,3
3,09/03/2020,1,4,0,0,4,2020,9,3
4,10/03/2020,1,5,0,0,5,2020,10,3
...,...,...,...,...,...,...,...,...,...
816,31/05/2022,99,94742,92215,721,1410,2022,5,31
817,01/06/2022,115,94857,92310,721,1430,2022,1,6
818,02/06/2022,93,94950,92408,722,1424,2022,2,6
819,03/06/2022,118,95068,92512,722,1438,2022,3,6


### 1.4 添加第n+1列为索引：`index_col=n`

In [8]:
pd.read_csv(csv_path,header = 0,names = ['日期','新增','累积','恢复','死亡','现有病例','年','月','日'], index_col=0)

Unnamed: 0_level_0,新增,累积,恢复,死亡,现有病例,年,月,日
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
06/03/2020,1,1,0,0,1,2020,6,3
07/03/2020,2,3,0,0,3,2020,7,3
08/03/2020,0,3,0,0,3,2020,8,3
09/03/2020,1,4,0,0,4,2020,9,3
10/03/2020,1,5,0,0,5,2020,10,3
...,...,...,...,...,...,...,...,...
31/05/2022,99,94742,92215,721,1410,2022,5,31
01/06/2022,115,94857,92310,721,1430,2022,1,6
02/06/2022,93,94950,92408,722,1424,2022,2,6
03/06/2022,118,95068,92512,722,1438,2022,3,6


### 1.5 取其中某些列：`usecols = ["列名1","列名2","列名3"..."列名m"]`

In [9]:
pd.read_csv(csv_path,
            header = 0,
            names = ['日期','新增','累积','恢复','死亡','现有病例','年','月','日'], 
            index_col=0, 
            usecols = ['日期','新增','累积'])

Unnamed: 0_level_0,新增,累积
日期,Unnamed: 1_level_1,Unnamed: 2_level_1
06/03/2020,1,1
07/03/2020,2,3
08/03/2020,0,3
09/03/2020,1,4
10/03/2020,1,5
...,...,...
31/05/2022,99,94742
01/06/2022,115,94857
02/06/2022,93,94950
03/06/2022,118,95068


### 1.6 取前n行：`nrows = n`

In [10]:
pd.read_csv(csv_path,
            header = 0,
            names = ['日期','新增','累积','恢复','死亡','现有病例','年','月','日'], 
            index_col=0, 
            usecols = ['日期','新增','累积'],
            nrows = 100)

Unnamed: 0_level_0,新增,累积
日期,Unnamed: 1_level_1,Unnamed: 2_level_1
06/03/2020,1,1
07/03/2020,2,3
08/03/2020,0,3
09/03/2020,1,4
10/03/2020,1,5
...,...,...
09/06/2020,2,632
10/06/2020,3,635
11/06/2020,5,640
12/06/2020,5,645


### 1.7 时间转换

In [11]:
pd.read_csv(csv_path,
            header = 0,
            names = ['日期','新增','累积','恢复','死亡','现有病例','年','月','日'], 
            parse_dates = [['年','月','日']],
            infer_datetime_format=True, # 可显著减少read_csv命令日期解析时间
            keep_date_col=True)         # 保存之前的日期格式

Unnamed: 0,年_月_日,日期,新增,累积,恢复,死亡,现有病例,年,月,日
0,2020-06-03,06/03/2020,1,1,0,0,1,2020,6,3
1,2020-07-03,07/03/2020,2,3,0,0,3,2020,7,3
2,2020-08-03,08/03/2020,0,3,0,0,3,2020,8,3
3,2020-09-03,09/03/2020,1,4,0,0,4,2020,9,3
4,2020-10-03,10/03/2020,1,5,0,0,5,2020,10,3
...,...,...,...,...,...,...,...,...,...,...
816,2022-05-31,31/05/2022,99,94742,92215,721,1410,2022,5,31
817,2022-01-06,01/06/2022,115,94857,92310,721,1430,2022,1,6
818,2022-02-06,02/06/2022,93,94950,92408,722,1424,2022,2,6
819,2022-03-06,03/06/2022,118,95068,92512,722,1438,2022,3,6


### 读取json

JSON(JavaScript Object Notation) 是一种轻量级的数据交换格式。它基于ECMAScript的一个子集。 JSON采用完全独立于语言的文本格式，但是也使用了类似于C语言家族的习惯(包括C、C++、Java、JavaScript、Perl、Python等)。这些特性使JSON成为理想的数据交换语言。易于人阅读和编写，同时也易于机器解析和生成(一般用于提升网络传输速率)。

JSON结构看起来和Python中的字典非常类似。需要注意的是，JSON格式通常是由key: 结对组成,其中key是字符串形式,value是字符串、数字、布尔值、数组、对象或null。

JSON在python中分别由list和dict组成。

- 名称/值对：代表数据，名称后跟'：'（冒号），名称/值对以逗号分隔。
- 大括号：容纳对象。
- 中括号：保留由（，）分隔的值的数组。

In [12]:
# 需要导入 json库, 接着我们使用open函数来读取JSON文件,最后利用json.load()函数将JSON字符串转化为Python字典形式
import json

with open(r'.\data\superheroes.json') as f:
    superHeroSquad = json.load(f)

print(type(superHeroSquad))  # Output: dict
print(superHeroSquad.keys())
# Output: dict_keys(['squadName', 'homeTown', 'formed', 'secretBase', 'active', 'members'])

<class 'dict'>
dict_keys(['squadName', 'homeTown', 'formed', 'secretBase', 'active', 'members'])


In [13]:
df = pd.read_json('https://mdn.github.io/learning-area/javascript/oojs/json/superheroes.json')
df

Unnamed: 0,squadName,homeTown,formed,secretBase,active,members
0,Super Hero Squad,Metro City,2016,Super tower,True,"{'name': 'Molecule Man', 'age': 29, 'secretIde..."
1,Super Hero Squad,Metro City,2016,Super tower,True,"{'name': 'Madame Uppercut', 'age': 39, 'secret..."
2,Super Hero Squad,Metro City,2016,Super tower,True,"{'name': 'Eternal Flame', 'age': 1000000, 'sec..."


In [14]:
def test():
    with open(r'.\data\superheroes.json') as f:
        superHeroSquad = json.load(f)
    out = pd.json_normalize(superHeroSquad, record_path=['members'],
                    meta=['squadName', 'homeTown', 'formed', 'secretBase', 'active'],
                    meta_prefix = 'members_')
    return out

- `record_path`为我们希望拆分的列的名字
- `meta`为列名的list，为我们输出的次序

In [15]:
test()

Unnamed: 0,name,age,secretIdentity,powers,members_squadName,members_homeTown,members_formed,members_secretBase,members_active
0,Molecule Man,29,Dan Jukes,"[Radiation resistance, Turning tiny, Radiation...",Super Hero Squad,Metro City,2016,Super tower,True
1,Madame Uppercut,39,Jane Wilson,"[Million tonne punch, Damage resistance, Super...",Super Hero Squad,Metro City,2016,Super tower,True
2,Eternal Flame,1000000,Unknown,"[Immortality, Heat Immunity, Inferno, Teleport...",Super Hero Squad,Metro City,2016,Super tower,True


In [16]:
superHeroSquad_copy = superHeroSquad.copy()
superHeroSquad_copy['members'][2]['secretIdentity'] = 'jing jing'
with open(r'.\data\superheroes_change.json', 'w') as file:
    json.dump(superHeroSquad, file)

In [17]:
# df.to_json('superheroes.json')

In [18]:
import json
 
dicts={"2name":"lucy","1sex":"boy"}
 
json_dicts=json.dumps(dicts)
#将Python的字典数据转换成json字符
print(json_dicts)

{"2name": "lucy", "1sex": "boy"}


In [19]:
import json
 
dicts={"2name":"lucy","1sex":"boy"}
 
json_dicts=json.dumps(dicts,indent=4,sort_keys=True)
print(json_dicts)

{
    "1sex": "boy",
    "2name": "lucy"
}


## 读取html

```python
pandas.read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=None, thousands=', ', encoding=None, decimal='.', converters=None, na_values=None, keep_default_na=True, displayed_only=True)
```

常用的参数：
- io:可以是url、html文本、本地文件等；
- flavor：解析器；
- header：标题行；
- skiprows：跳过的行；
- attrs：属性，比如 attrs = {'id': 'table'}；
- parse_dates：解析日期

注意：返回的结果是**DataFrame**组成的**list**。

In [20]:
url = "https://fund.eastmoney.com/fund.html#os_0;isall_0;ft_;pt_1"
table = pd.read_html(url, attrs = {'id': 'oTable'}, header=1)
type(table) 
# list
len(table)
# 1
table[0]

Unnamed: 0,关注,比较,序号,基金代码,基金简称,单位净值,累计净值,单位净值.1,累计净值.1,日增长值,日增长率,申购状态,赎回状态,手续费
0,,,1,1678,英大国企改革主题股票估值图基金吧,1.3970,2.0470,1.3360,1.9860,0.0610,4.57%,开放,开放,0.15%
1,,,2,2910,易方达供给改革混合估值图基金吧,2.8929,2.8929,2.7688,2.7688,0.1241,4.48%,开放,开放,0.15%
2,,,3,519183,万家双引擎灵活配置混合估值图基金吧,2.4229,3.1129,2.3206,3.0106,0.1023,4.41%,开放,开放,0.15%
3,,,4,161032,富国中证煤炭指数(LOF)A估值图基金吧,2.0110,1.4010,1.9300,1.3450,0.0810,4.20%,开放,开放,0.00%
4,,,5,13275,富国中证煤炭指数(LOF)C估值图基金吧,2.0080,2.0080,1.9270,1.9270,0.0810,4.20%,开放,开放,0.00%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,,,196,7675,工银产业升级股票C估值图基金吧,1.3697,1.3697,1.3356,1.3356,0.0341,2.55%,开放,开放,0.00%
196,,,197,11355,华泰柏瑞港股通时代机遇混合A估值图基金吧,0.6432,0.6432,0.6272,0.6272,0.0160,2.55%,开放,开放,0.15%
197,,,198,11356,华泰柏瑞港股通时代机遇混合C估值图基金吧,0.6386,0.6386,0.6227,0.6227,0.0159,2.55%,开放,开放,0.00%
198,,,199,13919,建信中小盘先锋股票C估值图基金吧,4.3320,4.3320,4.2250,4.2250,0.1070,2.53%,限大额,开放,0.00%


---

## 课后习题 

### Q1

读取数据（链接），并且随机跳过（skip）90%的行（seed=1）

https://github.com/datoujinggzj/WhaleDataScienceProject/blob/master/Python_data/ufo.csv

In [21]:
url1 = 'https://raw.githubusercontent.com/datoujinggzj/WhaleDataScienceProject/master/Python_data/ufo.csv'

In [22]:
pd.read_csv(url1, index_col=0, skiprows = lambda x: x>0 and x%2==0)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00
8,Eklutna,,CIGAR,AK,10/15/1936 17:00
...,...,...,...,...,...
18232,Lodi,,,WI,12/31/2000 20:30
18234,Capitola,,TRIANGLE,CA,12/31/2000 22:00
18236,Grant Park,,TRIANGLE,IL,12/31/2000 23:00
18238,Eagle River,,,WI,12/31/2000 23:45


In [23]:
def logic(index):
    if index>0 and np.random.rand()>0.1:
        return True
    return False

np.random.seed(1)
pd.read_csv(url1,index_col = 0, skiprows = lambda x: logic(x))

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
2,Holyoke,,OVAL,CO,2/15/1931 14:00
5,Valley City,,DISK,ND,9/15/1934 15:30
14,Ludington,,DISK,MI,6/1/1941 13:00
26,Wilderness,,DISK,WV,1/1/1944 12:00
27,Clovis,,DISK,NM,4/2/1944 11:00
...,...,...,...,...,...
18166,Mojave,,TRIANGLE,CA,12/21/2000 17:54
18204,Bremerton,,OTHER,WA,12/28/2000 3:00
18214,Austin,,FORMATION,TX,12/29/2000 0:00
18216,Garden Grove,ORANGE,LIGHT,CA,12/29/2000 16:10


In [24]:
# 点击raw，获取raw链接才行，不然不可以直接读url
np.random.seed(1)
# pd.read_csv(url1,index_col = 0, skiprows = lambda x: x>0 and np.random.rand()>0.1)

### Q2

读取数据（链接），并随机从0-100行中挑选10行去掉（seed=2022）

In [25]:
url2 = 'https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/'
np.random.seed(2022)
pd.read_html(url2,skiprows = np.random.randint(101,size = 10))[0]

Unnamed: 0,Bank NameBank,CityCity,StateSt,CertCert,Acquiring InstitutionAI,Closing DateClosing,FundFund
0,Almena State Bank,Almena,KS,15426,Equity Bank,"October 23, 2020",10538
1,First City Bank of Florida,Fort Walton Beach,FL,16748,"United Fidelity Bank, fsb","October 16, 2020",10537
2,The First State Bank,Barboursville,WV,14361,"MVB Bank, Inc.","April 3, 2020",10536
3,Ericson State Bank,Ericson,NE,18265,Farmers and Merchants Bank,"February 14, 2020",10535
4,City National Bank of New Jersey,Newark,NJ,21111,Industrial Bank,"November 1, 2019",10534
...,...,...,...,...,...,...,...
548,"Superior Bank, FSB",Hinsdale,IL,32646,"Superior Federal, FSB","July 27, 2001",6004
549,Malta National Bank,Malta,OH,6629,North Valley Bank,"May 3, 2001",4648
550,First Alliance Bank & Trust Co.,Manchester,NH,34264,Southern New Hampshire Bank & Trust,"February 2, 2001",4647
551,National State Bank of Metropolis,Metropolis,IL,3815,Banterra Bank of Marion,"December 14, 2000",4646


### Q3

读取数据（链接），爬取前5页数据

http://s.askci.com/stock/a/?reportTime=2022-06-01

In [26]:
import pandas as pd
import csv
for i in range(1,5+1):  # 爬取前5页数据
    #url3 = 'http://s.askci.com/stock/a/?reportTime=2022-06-01&pageNum=%s' % (str(i))
    url3 = f'http://s.askci.com/stock/a/?reportTime=2022-06-01&pageNum={i}'
    tb = pd.read_html(url3)[3] #网页的第4个表格
    tb.to_csv(r'day1_q3.csv', mode='a', encoding='utf_8_sig', header=1, index=0)
    print('第'+str(i)+'页DONE')

第1页DONE
第2页DONE
第3页DONE
第4页DONE
第5页DONE


In [27]:
df = pd.read_csv('day1_q3.csv')
# 我们要去掉所有重复行（列名作为行被写进来了，我们去掉）
df.drop_duplicates(keep=False)

Unnamed: 0,序号,股票代码,股票简称,公司名称,省份,城市,主营业务收入(202206),净利润(202206),员工人数,上市日期,招股书,公司财报,行业分类,产品类型,主营业务
0,1,1,平安银行,平安银行股份有限公司,广东,深圳市,--,--,40651,1991-04-03,--,,银行,商业银行业务,经有关监管机构批准的各项商业银行业务。
1,2,2,万科A,万科企业股份有限公司,广东,深圳市,--,--,139494,1991-01-29,--,,房地产开发,房地产、物业管理、投资咨询,房地产开发和物业服务。
2,3,4,ST国华,深圳国华网安科技股份有限公司,广东,深圳市,--,--,309,1991-01-14,--,,生物医药,应急类产品、安全加固检测类、安全检测类产品、集成产品、智慧类产品、移动应用安全检测平台、移动...,移动互联网安全、智慧城市及应急业务。
3,4,5,ST星源,深圳世纪星源股份有限公司,广东,深圳市,--,--,563,1990-12-10,--,,环保工程、物业管理,酒店经营、物业管理、环保业务,绿色低碳城市社区建设相关的服务业务。
4,5,6,深振业A,深圳市振业(集团)股份有限公司,广东,深圳市,--,--,403,1992-04-27,--,,房地产开发,房地产,从事房地产开发与销售。
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,96,488,晨鸣纸业,山东晨鸣纸业集团股份有限公司,山东,潍坊市,--,--,12296,2000-11-20,--,,造纸,制浆、造纸,机制纸及板纸和造纸原料、造纸机械、电力、热力的生产与销售
100,97,498,山东路桥,山东高速路桥集团股份有限公司,山东,济南市,--,--,24686,1997-06-09,--,,路桥建设,路桥工程施工、路桥养护施工、周转材料及设备租赁销售、商品混凝土加工销售、工程设计咨询,路桥工程施工与养护施工。
101,98,501,武商集团,武商集团股份有限公司,湖北,武汉市,--,--,9373,1992-11-20,--,,百货,商品销售,从事购物中心及超市业态的商品销售业务。
102,99,502,绿景退,绿景控股股份有限公司,广东,广州市,--,--,148,1992-11-23,--,,房地产开发,房地产业(房产销售及租赁)、房地产业(物业管理)、医疗服务,从事房地产开发及物业管理业务、互联网数据中心业务。
