In [2]:
# Understanding the pandas `axis` parameter (part 10)

# 1. How should the pandas `axis` parameter be read?
# (1) axis=0 or "index":
#     - for a single-row operation it designates one particular row
#     - for an aggregation it means "across rows"
# (2) axis=1 or "columns":
#     - for a single-column operation it designates one particular column
#     - for an aggregation it means "across columns"
# Rule of thumb: the axis you name is the one that moves (as if iterated by a
# for-loop); the other axis stays fixed.

import pandas as pd
import numpy as np

# 3x4 frame holding 0..11 in row-major order, with columns labelled A-D.
df = pd.DataFrame(data=np.arange(12).reshape((3, 4)), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [8]:
# (2) Dropping along axis=1 removes a single column (here the one labelled 'A').
# A new frame is returned; df itself keeps all of its columns.
df.drop(columns='A')

Unnamed: 0,B,C,D
0,1,2,3
1,5,6,7
2,9,10,11


In [10]:
# Dropping along axis=0 removes a single row (here the one labelled 1).
df.drop(index=1)

Unnamed: 0,A,B,C,D
0,0,1,2,3
2,8,9,10,11


In [11]:
# (4) mean aggregated along axis=0 / "index".
# Counter-intuitive: the result holds one value per column, not one per row,
# because the row axis is the one that is traversed and collapsed.
df.mean(axis="index")

A    4.0
B    5.0
C    6.0
D    7.0
dtype: float64

In [12]:
# (5) mean aggregated along axis=1 / "columns".
# Counter-intuitive: the result holds one value per row, not one per column.
# The axis that is named is the one that moves (as if iterated by a for-loop),
# while the other axis stays fixed.
df.mean(axis="columns")

0    1.5
1    5.5
2    9.5
dtype: float64

In [14]:
# (6) One more example to reinforce the idea.
def get_sum_value(x):
    """Return x["A"] + x["B"] + x["C"] + x["D"], added left to right.

    ``x`` is anything indexable by those four labels (a DataFrame row when the
    function is used with ``DataFrame.apply``).
    """
    total = x["A"]
    for label in ("B", "C", "D"):
        total = total + x[label]
    return total

# axis=1 hands get_sum_value one row at a time; the per-row totals are stored
# as the new column "sum_value".
df["sum_value"] = df.apply(get_sum_value, axis="columns")
df

Unnamed: 0,A,B,C,D,sum_value
0,0,1,2,3,6
1,4,5,6,7,22
2,8,9,10,11,38


## Pandas怎样实现DataFrame的Merge（十二）

In [15]:
import pandas as pd


def _read_double_colon_table(path, header):
    """Read a "::"-delimited text file, naming its columns from a "::"-joined header."""
    # A multi-character separator requires the python parsing engine.
    return pd.read_csv(path, sep="::", engine='python', names=header.split("::"))


df_ratings = _read_double_colon_table(
    r"D:\test1\ml-1m\ratings.dat", "UserID::MovieID::Rating::Timestamp"
)
df_ratings.head()  # not the last expression of the cell, so nothing is displayed

# The other tables are loaded the same way.
df_users = _read_double_colon_table(
    r"D:\test1\ml-1m\users.dat", "UserID::Gender::Age::Occupation::Zip-code"
)
df_movies = _read_double_colon_table(
    r"D:\test1\ml-1m\movies.dat", "MovieID::Title::Genres"
)

# Inner joins: the columns of both inputs end up in a single table.
df_ratings_users = df_ratings.merge(
    df_users, left_on="UserID", right_on="UserID", how="inner"
)
df_ratings_users_movies = df_ratings_users.merge(
    df_movies, left_on="MovieID", right_on="MovieID", how="inner"
)


In [16]:
df_ratings_users

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code
0,1,1193,5,978300760,F,1,10,48067
1,1,661,3,978302109,F,1,10,48067
2,1,914,3,978301968,F,1,10,48067
3,1,3408,4,978300275,F,1,10,48067
4,1,2355,5,978824291,F,1,10,48067
...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,M,25,6,11106
1000205,6040,1094,5,956704887,M,25,6,11106
1000206,6040,562,5,956704746,M,25,6,11106
1000207,6040,1096,4,956715648,M,25,6,11106


In [17]:
df_ratings_users_movies

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western


In [18]:
# One-to-one relationship: every sno occurs exactly once on each side,
# so the merged result has 4 rows.
left = pd.DataFrame({
    'sno': [11, 12, 13, 14],
    'name': ['name_a', 'name_b', 'name_c', 'name_d'],
})
right = pd.DataFrame({
    'sno': [11, 12, 13, 14],
    'age': ['21', '22', '23', '24'],
})
left.merge(right, on='sno')


Unnamed: 0,sno,name,age
0,11,name_a,21
1,12,name_b,22
2,13,name_c,23
3,14,name_d,24


In [19]:
# One-to-many relationship: the row count follows the "many" side, because each
# left row is repeated once per matching right row.
left = pd.DataFrame({
    'sno': [11, 12, 13, 14],
    'name': ['name_a', 'name_b', 'name_c', 'name_d'],
})
right = pd.DataFrame({
    'sno': [11, 11, 11, 12, 12, 13],
    'grade': ['语文88', '数学90', '英语75','语文66', '数学55', '英语29'],
})
left.merge(right, on='sno')


Unnamed: 0,sno,name,grade
0,11,name_a,语文88
1,11,name_a,数学90
2,11,name_a,英语75
3,12,name_b,语文66
4,12,name_b,数学55
5,13,name_c,英语29


In [20]:
# Many-to-many relationship: each left row is paired with every right row that
# shares its sno (2 x 3 rows for sno 11, 3 x 2 rows for sno 12).
left = pd.DataFrame({
    'sno': [11, 11, 12, 12,12],
    '爱好': ['篮球', '羽毛球', '乒乓球', '篮球', "足球"],
})
right = pd.DataFrame({
    'sno': [11, 11, 11, 12, 12, 13],
    'grade': ['语文88', '数学90', '英语75','语文66', '数学55', '英语29'],
})
left.merge(right, on='sno')


Unnamed: 0,sno,爱好,grade
0,11,篮球,语文88
1,11,篮球,数学90
2,11,篮球,英语75
3,11,羽毛球,语文88
4,11,羽毛球,数学90
5,11,羽毛球,英语75
6,12,乒乓球,语文66
7,12,乒乓球,数学55
8,12,篮球,语文66
9,12,篮球,数学55


In [1]:
# NOTE(review): `df` is not created in this cell; the recorded run ended with
# "NameError: name 'df' is not defined". It has to be a DataFrame with "bWendu",
# "aqi" and "ymd" columns that is loaded beforehand.

# 2. Use the Series `.str` accessor to reach the string-processing functions.
# String replacement
df["bWendu"].str.replace("℃", "")
# Test whether the value consists of numeric characters only
df["bWendu"].str.isnumeric()
# NOTE(review): `.str` only works on string values — confirm "aqi" is a text column.
df["aqi"].str.len()

# 4. Chaining several `.str` operations.
# How can a numeric month such as 201803 be extracted?
# 1) first turn the date 2018-03-31 into the form 20180331
# 2) then take the month string 201803
#    (this line was bare text without a leading '#', which is a SyntaxError)
df["ymd"].str.replace("-", "")
df["ymd"].str.replace("-", "").str.slice(0, 6)
# slice is the ordinary slicing syntax, so it can be written directly (same result)
df["ymd"].str.replace("-", "").str[0:6]


# 5、使用正则表达式的处理
# 添加新列
def get_nianyueri(x):
    """Format x["ymd"] (a "year-month-day" string) as "<year>年<month>月<day>日".

    Raises ValueError when the value does not split into exactly three parts on "-".
    """
    date_text = x["ymd"]
    year, month, day = date_text.split("-")
    return f"{year}年{month}月{day}日"
df["中文日期"] = df.apply(get_nianyueri, axis=1)
df["中文日期"]

# Question: how can the three characters 年, 月 and 日 be removed from "2018年12月31日"?

# Method 1: chained literal replace
df["中文日期"].str.replace("年", "").str.replace("月","").str.replace("日", "")


# Since pandas 2.0, Series.str.replace treats the pattern as a literal string by
# default (regex=False), so regular-expression mode must be requested explicitly.

# Method 2: regular-expression replace
df["中文日期"].str.replace("[年月日]", "", regex=True)

NameError: name 'df' is not defined

In [None]:
# Python处理Excel一列变多列（四十二）
# 2. 实现拆分

def split_func(line):
    """Split line["数据"] on ":" into four fields and store them back on ``line``.

    The four pieces are written to the keys "姓名", "性别", "年龄" and "城市", in
    that order, and the same (mutated) ``line`` object is returned. A value that
    does not split into exactly four pieces raises ValueError before anything
    is written.
    """
    name, gender, age, city = line["数据"].split(":")
    line["姓名"] = name
    line["性别"] = gender
    line["年龄"] = age
    line["城市"] = city
    return line

# Split every row, then discard the raw combined column. The two steps are chained
# so nothing is mutated in place.
df = df.apply(split_func, axis=1).drop(columns=["数据"])

# 3. Write the result to an Excel file (index=False leaves the row index out).

df.to_excel("./course_datas/c42_split_onecolumn_tomany/学生数据表_拆分后.xlsx", index=False)


In [27]:
import pandas as pd


# Add the column header: the names are supplied explicitly, so no row of the
# file is consumed as a header.
column_names = [
    'id', 'name', 'age', 'weight',
    'm0006', 'm0612', 'm1218',
    'f0006', 'f0612', 'f1218',
]
df = pd.read_csv('patient_heart_rate.csv', header=None, names=column_names)
df

In [7]:
# 2. One column holding several values
#
# The name column packs two values, a first name and a last name. To tidy the
# data, it is split into first_name and last_name.
#
# str.split(expand=True) splits on whitespace and spreads the pieces over new
# columns; the original name column is removed afterwards.

name_parts = df['name'].str.split(expand=True)
df[['first_name', 'last_name']] = name_parts
df.drop(columns='name', inplace=True)

In [8]:
df

Unnamed: 0,id,age,weight,m0006,m0612,m1218,f0006,f0612,f1218,first_name,last_name
0,1.0,56.0,70kgs,72,69,71,-,-,-,Mickéy,Mousé
1,2.0,34.0,154.89lbs,-,-,-,85,84,76,Donald,Duck
2,3.0,16.0,,-,-,-,65,69,72,Mini,Mouse
3,4.0,,78kgs,78,79,72,-,-,-,Scrooge,McDuck
4,5.0,54.0,198.658lbs,-,-,-,69,,75,Pink,Panther
5,6.0,52.0,189lbs,-,-,-,68,75,72,Huey,McDuck
6,7.0,19.0,56kgs,-,-,-,71,78,75,Dewey,McDuck
7,8.0,32.0,78kgs,78,76,75,-,-,-,Scööpy,Doo
8,,,,,,,,,,,
9,9.0,52.0,189lbs,-,-,-,68,75,72,Huey,McDuck


In [10]:
# 3. Inconsistent units within a column
#
# The weight column mixes units: some values are in kgs, others in lbs.

# Boolean mask of the rows whose weight is expressed in lbs. na=False maps missing
# weights directly to False and yields a boolean mask, instead of filling an
# object-dtype result after the fact.
rows_with_lbs = df['weight'].str.contains('lbs', na=False)
df[rows_with_lbs]

Unnamed: 0,id,age,weight,m0006,m0612,m1218,f0006,f0612,f1218,first_name,last_name
1,2.0,34.0,154.89lbs,-,-,-,85,84.0,76,Donald,Duck
4,5.0,54.0,198.658lbs,-,-,-,69,,75,Pink,Panther
5,6.0,52.0,189lbs,-,-,-,68,75.0,72,Huey,McDuck
9,9.0,52.0,189lbs,-,-,-,68,75.0,72,Huey,McDuck


In [14]:
# To unify the units, the weights given in lbs are converted to kgs.
# NOTE(review): the recorded output shows 31kgs for 154.89lbs, which a divisor of
# 2.2 cannot produce (it gives 70kgs) — the output looks stale; re-run to confirm.

for i in df[rows_with_lbs].index:
    # Strip the 3-character "lbs" suffix, convert, and truncate to a whole number.
    pounds = float(df.at[i, 'weight'][:-3])
    df.at[i, 'weight'] = '{}kgs'.format(int(pounds / 2.2))
df

Unnamed: 0,id,age,weight,m0006,m0612,m1218,f0006,f0612,f1218,first_name,last_name
0,1.0,56.0,70kgs,72,69,71,-,-,-,Mickéy,Mousé
1,2.0,34.0,31kgs,-,-,-,85,84,76,Donald,Duck
2,3.0,16.0,,-,-,-,65,69,72,Mini,Mouse
3,4.0,,78kgs,78,79,72,-,-,-,Scrooge,McDuck
4,5.0,54.0,40kgs,-,-,-,69,,75,Pink,Panther
5,6.0,52.0,38kgs,-,-,-,68,75,72,Huey,McDuck
6,7.0,19.0,56kgs,-,-,-,71,78,75,Dewey,McDuck
7,8.0,32.0,78kgs,78,76,75,-,-,-,Scööpy,Doo
8,,,,,,,,,,,
9,9.0,52.0,38kgs,-,-,-,68,75,72,Huey,McDuck


In [15]:
# 1. Missing values
# Both NaN and "-" stand for missing data. Possible ways of filling them in:
#   - delete the records that have missing data
#   - use the mean of the column
#   - use the most frequent value of the column

# 2. Empty rows
# A row that is empty in every column is simply deleted.
df.dropna(axis='index', how='all', inplace=True)

In [16]:
df

Unnamed: 0,id,age,weight,m0006,m0612,m1218,f0006,f0612,f1218,first_name,last_name
0,1.0,56.0,70kgs,72,69,71,-,-,-,Mickéy,Mousé
1,2.0,34.0,31kgs,-,-,-,85,84,76,Donald,Duck
2,3.0,16.0,,-,-,-,65,69,72,Mini,Mouse
3,4.0,,78kgs,78,79,72,-,-,-,Scrooge,McDuck
4,5.0,54.0,40kgs,-,-,-,69,,75,Pink,Panther
5,6.0,52.0,38kgs,-,-,-,68,75,72,Huey,McDuck
6,7.0,19.0,56kgs,-,-,-,71,78,75,Dewey,McDuck
7,8.0,32.0,78kgs,78,76,75,-,-,-,Scööpy,Doo
9,9.0,52.0,38kgs,-,-,-,68,75,72,Huey,McDuck
10,10.0,12.0,45kgs,-,-,-,92,95,87,Louie,McDuck


In [18]:
# 4. Non-ASCII characters
# Remove them outright with a regular expression.

# The cleaned column is assigned back instead of calling replace(inplace=True) on
# a column selection: that form acts on a temporary object and does not update df
# under pandas Copy-on-Write.
for column in ('first_name', 'last_name'):
    df[column] = df[column].replace({r'[^\x00-\x7F]+':''}, regex=True)
df

Unnamed: 0,id,age,weight,m0006,m0612,m1218,f0006,f0612,f1218,first_name,last_name
0,1.0,56.0,70kgs,72,69,71,-,-,-,Micky,Mous
1,2.0,34.0,31kgs,-,-,-,85,84,76,Donald,Duck
2,3.0,16.0,,-,-,-,65,69,72,Mini,Mouse
3,4.0,,78kgs,78,79,72,-,-,-,Scrooge,McDuck
4,5.0,54.0,40kgs,-,-,-,69,,75,Pink,Panther
5,6.0,52.0,38kgs,-,-,-,68,75,72,Huey,McDuck
6,7.0,19.0,56kgs,-,-,-,71,78,75,Dewey,McDuck
7,8.0,32.0,78kgs,78,76,75,-,-,-,Scpy,Doo
9,9.0,52.0,38kgs,-,-,-,68,75,72,Huey,McDuck
10,10.0,12.0,45kgs,-,-,-,92,95,87,Louie,McDuck


In [22]:
# Duplicate records: rows sharing the same first_name and last_name are treated as
# duplicates and removed with drop_duplicates(), which keeps the first occurrence.
df.drop_duplicates(subset=['first_name', 'last_name'], inplace=True)
df

Unnamed: 0,id,age,weight,m0006,m0612,m1218,f0006,f0612,f1218,first_name,last_name
0,1.0,56.0,70kgs,72,69,71,-,-,-,Micky,Mous
1,2.0,34.0,31kgs,-,-,-,85,84,76,Donald,Duck
2,3.0,16.0,,-,-,-,65,69,72,Mini,Mouse
3,4.0,,78kgs,78,79,72,-,-,-,Scrooge,McDuck
4,5.0,54.0,40kgs,-,-,-,69,,75,Pink,Panther
5,6.0,52.0,38kgs,-,-,-,68,75,72,Huey,McDuck
6,7.0,19.0,56kgs,-,-,-,71,78,75,Dewey,McDuck
7,8.0,32.0,78kgs,78,76,75,-,-,-,Scpy,Doo
10,10.0,12.0,45kgs,-,-,-,92,95,87,Louie,McDuck


In [None]:
# The "/blob/" address is GitHub's HTML viewer page, which read_csv cannot parse
# as CSV; the "/raw/" address serves the file content itself.
url='https://github.com/MuseumofModernArt/collection/raw/master/Artworks.csv'
df1=pd.read_csv(url)
df1

In [30]:
# What `expand` does in the pandas string-splitting methods
#
# expand decides whether the pieces are spread over the columns of a DataFrame
# (True) or stay together as a Series of lists (False).
# n caps the number of splits performed on the "_" separator.

import numpy as np
import pandas as pd

s2 = pd.Series(['a_b_c_f_j', 'c_d_e_f_h', np.nan, 'f_g_h_x_g'])
separator = "-----------------------------------"
tokens = s2.str.split('_')

print(separator)
print(tokens)
# Two equivalent ways of picking the second token of every row.
print(separator)
print(tokens.str.get(1))
print(separator)
print(tokens.str[1])

print("---------------expand=True--------------------")
expand1 = s2.str.split('_', expand=True)
print(expand1)
print(type(expand1))
print("---------------expand=False--------------------")
expand2 = s2.str.split('_', expand=False)
print(expand2)
print(type(expand2))

print("##########################################################")
# rsplit works from the right, so n=1 separates only the last token.
print("---------------expand=True,n=1--------------------")
expand1 = s2.str.rsplit('_', expand=True, n=1)
print(expand1)
print("---------------expand=False,n=1--------------------")
expand2 = s2.str.rsplit('_', expand=False, n=1)
print(expand2)
 


-----------------------------------
0    [a, b, c, f, j]
1    [c, d, e, f, h]
2                NaN
3    [f, g, h, x, g]
dtype: object
-----------------------------------
0      b
1      d
2    NaN
3      g
dtype: object
-----------------------------------
0      b
1      d
2    NaN
3      g
dtype: object
---------------expand=True--------------------
     0    1    2    3    4
0    a    b    c    f    j
1    c    d    e    f    h
2  NaN  NaN  NaN  NaN  NaN
3    f    g    h    x    g
<class 'pandas.core.frame.DataFrame'>
---------------expand=False--------------------
0    [a, b, c, f, j]
1    [c, d, e, f, h]
2                NaN
3    [f, g, h, x, g]
dtype: object
<class 'pandas.core.series.Series'>
##########################################################
---------------expand=True,n=1--------------------
         0    1
0  a_b_c_f    j
1  c_d_e_f    h
2      NaN  NaN
3  f_g_h_x    g
---------------expand=False,n=1--------------------
0    [a_b_c_f, j]
1    [c_d_e_f, h]
2             

In [None]:
# Step 1: load an Excel workbook with pandas
import os  # required by os.chdir below; it was used without being imported

import pandas as pd
from pandas import Series, DataFrame

# Switch to the data directory so the relative file names below resolve there.
os.chdir(r'd:\test1')
df = pd.read_excel('./datapratice.xlsx',engine='openpyxl')
# Write the frame back out as food.xlsx (the row index is written as well).
df.to_excel('food.xlsx')
print(df)