# 1. Series 对象的创建
## 1.1 numpy 的 ndarray 转 Series 对象

In [1]:
# 导包
import os

import numpy as np
import pandas as pd

In [None]:
# Build a one-dimensional ndarray from a Python list
arr = np.array([1, 2, 3, 4])
print(arr)

In [None]:
# Wrap the ndarray created above in a Series
s1 = pd.Series(data=arr)
print(s1)

## 1.2 直接传入 Python 列表,构建 Series 对象

In [None]:
# A plain Python list works as the data source; no index is given here
s2 = pd.Series(data=["张三", "男", 33])
print(s2)

## 1.3 传入 Python 列表,构建 Series 对象,并指定索引

In [2]:
# Same values as before, now with explicit string labels passed via `index`
s2 = pd.Series(data=["张三", "男", 33], index=["name", "sex", "age"])
print(s2)

name    张三
sex      男
age     33
dtype: object


## 1.4 通过元组的方式创建

In [3]:
# A tuple is accepted as the data source in the same way as a list
s2 = pd.Series(data=("张三", "男", 33), index=["name", "sex", "age"])
print(s2)

name    张三
sex      男
age     33
dtype: object


## 1.5 通过字典的方式创建

In [4]:
# With a dict, the keys become the index labels and the values become the data
s2 = pd.Series(
    data={
        "name": "张三",
        "sex": "男",
        "age": 33,
    }
)
print(s2)

name    张三
sex      男
age     33
dtype: object


# 2. 创建 DataFrame 对象
## 2.1 字典方式构建 df 对象

In [5]:
# Column-oriented input: each key/value pair of the dict supplies one column
dict_data = {
    "id": [1, 2, 3],
    "name": ["张", "李", "王"],
    "age": [20] * 3,
}
df1 = pd.DataFrame(data=dict_data)
print(df1)

   id name  age
0   1    张   20
1   2    李   20
2   3    王   20


## 2.2 列表 + 元组方式构建df对象

In [None]:
# Row-oriented input: each tuple is one row; `index` labels the rows and
# `columns` names the columns
list_data = [
    (1, "乔峰", 33),
    (2, "虚竹", 29),
    (3, "段誉", 21),
]
df2 = pd.DataFrame(
    data=list_data,
    index=["x", "y", "z"],
    columns=["id", "name", "age"],
)
print(df2)

## 3. Series 的常用属性
如下:
```text
loc             根据 索引标签 获取数据
iloc            根据 行号(位置,从 0 开始) 获取数据
dtype 或 dtypes  获取 Series 元素类型
T               转置(属性,不是函数)
shape           维度
size            大小(元素个数)
values          获取所有的值
index           获取所有的索引,功能类似于 keys() 方法
```

In [None]:
# Show the current working directory so the relative "data/..." paths used by
# the read_csv calls below can be checked. `os` is imported in the import cell
# at the top of the notebook.
os.getcwd()

In [None]:
# Load nobel_prizes.csv into a DataFrame; index_col makes the "id" column the row index
df = pd.read_csv("data/nobel_prizes.csv", index_col="id")
df.head(n=5)  # first 5 rows, which is also what head() returns by default

In [None]:
# 2. Take the first row of the df above; a single row comes back as a Series
# Commented-out alternative using label-based lookup
# (presumably 941 is the "id" label of the first row - verify against the data):
# first_row = df.loc[941]
# print(first_row)

# iloc selects by integer position, so 0 is the first row
first_row = df.iloc[0]
first_row

In [None]:
# 3. Series attributes
print(first_row.dtype)          # dtype of the row Series as a whole (original note: object)
print(first_row['year'].dtype)  # dtype of a single element (original note: int64)

# Per the original note, 'firstname' holds a plain Python str, which has no
# .dtype attribute. Catch the AttributeError and print it, so the demonstration
# stays visible without aborting "Restart Kernel -> Run All".
try:
    print(first_row['firstname'].dtype)
except AttributeError as err:
    print(f"AttributeError: {err}")

## 4. Series 常用方法

In [None]:
# 1. 构建 Series 对象
s1 = pd.Series([1,2,3,4,5,6,6],index=['A','B','C','D','E',"F","G"])
print(s1)

In [None]:
# 2. Demonstrate common Series methods (uncomment a line to try it)
# print(len(s1))            # length: 7
# print(s1.size)            # number of elements: 7
# print(s1.head())          # first 5 rows by default
# print(s1.head(n=2))       # first 2 rows

# print(s1.tail())            # last 5 rows by default
# print(s1.tail(n=3))         # last 3 rows

# print(s1.keys())            # index of the Series
# print(s1.index)             # index of the Series
#
# print(s1.tolist())         # convert to a list
# print(s1.to_list())        # convert to a list
# print(type(s1.tolist()))   # <class 'list'>
# print(s1.to_frame())       # convert to a DataFrame

# print(s1.describe())       # summary statistics, e.g. max / min / mean / std
# print(s1.max())
# print(s1.min())
# print(s1.mean())
# print(s1.std())             # standard deviation

# print(s1.drop_duplicates())    # de-duplicate, returns a Series
# print(s1.unique())              # de-duplicate, returns an array


# print(s1.sort_values())         # sort by value, ascending by default
# print(s1.sort_values(ascending=False))  # sort by value, descending

# print(s1.sort_index())              # sort by index, ascending by default
# print(s1.sort_index(ascending=False)) # sort by index, descending

# print(s1.value_counts())        # count how often each value occurs


s1.hist()                   # draw a histogram of the values



## 5. 电影数据案例

In [6]:
# 1. Load the movie dataset and preview its first rows
movie_df = pd.read_csv("data/movie.csv")
movie_df.head()


Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [8]:
# 2. 从 df 对象中,获取导演的名字
director = movie_df['director_name']
# print(director)

actor_1_fb_likes = movie_df['actor_1_facebook_likes']  # 获取主演的 facebook 点赞数
print(actor_1_fb_likes)


0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64


In [9]:
# 3. 统计不同导演执导的电影数量
# director.value_counts()
# 4. 统计主演各个点赞数,即 1000有几个 10000有几个
actor_1_fb_likes.value_counts()

actor_1_facebook_likes
1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
          ... 
703.0        1
208.0        1
79.0         1
269.0        1
291.0        1
Name: count, Length: 877, dtype: int64

In [10]:
# 5. Inspect missing values
# director.count()  # number of non-null values

# director.shape     # total number of rows; rows minus count() is the number of nulls

# 6. Print summary statistics
actor_1_fb_likes.describe()  # count, mean, std, min, quartiles and max

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

## 6. Series 的布尔值的操作

In [7]:
# 1. 读取数据源,获取 df 对象
df = pd.read_csv("data/scientists.csv")
df

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [13]:
# 2. 手动传入布尔值,获取内容
bool_list = [True,True,True,True,False,False,False,False]
df[bool_list]

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist


In [8]:
# 3. Filter rows with a computed boolean mask
# Goal: keep the scientists whose age is above the average age
# 3.1 Age column as a Series
ages_series = df["Age"]

# 3.2 Mean age
avg_age = ages_series.mean()
print(avg_age)

# 3.3 Comparing the column with the mean yields a boolean Series,
#     which is then used as the row mask
is_older_than_avg = ages_series > avg_age
data = df[is_older_than_avg]
data

59.125


Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


## 7. Series 运算

In [14]:
# Series arithmetic rules:
# - Series with a scalar: the scalar is combined with every element in turn.
# - Series with Series: elements are paired by index label, not by position,
#   even when both Series have the same number of elements.
# - Labels that exist in only one of the two Series give a missing value (NaN)
#   in the result.
# Build a copy of the ages sorted in descending order; each value keeps its original label.
rev_series = ages_series.sort_values(ascending=False)
rev_series

2    90
7    77
3    66
1    61
4    56
5    45
6    41
0    37
Name: Age, dtype: int64

In [15]:
# Show the ages in their original order, for comparison with rev_series
ages_series

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [12]:
# Addition aligns on index labels, so each age is added to itself
# even though rev_series is ordered differently from ages_series
rev_series + ages_series

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64