## 创建Dataframe 

In [1]:
data = {"grammer":["Python","C","Java","GO",np.nan,"SQL","PHP","Python"],        
        "score":[1,2,np.nan,4,5,6,7,10]}

In [2]:
import numpy as np 
import pandas as pd 
df=pd.DataFrame(data)
df

Unnamed: 0,grammer,score
0,Python,1.0
1,C,2.0
2,Java,
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


R中没有字典概念,故直接创建 dataframe/ tibble

In [3]:
import rpy2.ipython
%load_ext rpy2.ipython

In [4]:
%%R
# 第一种
df1 <- data.frame( 
    "grammer" = c("Python","C","Java","GO",NA,"SQL","PHP","Python"),
    "score" = c(1,2,NA,4,5,6,7,10)
)
df1

  grammer score
1  Python     1
2       C     2
3    Java    NA
4      GO     4
5    <NA>     5
6     SQL     6
7     PHP     7
8  Python    10


In [5]:
%%R
# 第二种
library(tibble) 
df2 <- tibble( 
    "grammer" = c("Python","C","Java","GO",NA,"SQL","PHP","Python"),
    "score" = c(1,2,NA,4,5,6,7,10) 
)
df2

[90m# A tibble: 8 x 2[39m
  grammer score
  [3m[90m<chr>[39m[23m   [3m[90m<dbl>[39m[23m
[90m1[39m Python      1
[90m2[39m C           2
[90m3[39m Java       [31mNA[39m
[90m4[39m GO          4
[90m5[39m [31mNA[39m          5
[90m6[39m SQL         6
[90m7[39m PHP         7
[90m8[39m Python     10


**也可以用 -ribble横向建tible**

## 数据提取

In [6]:
df[df.grammer=='Python']  # Python 解法

Unnamed: 0,grammer,score
0,Python,1.0
7,Python,10.0


In [7]:
%%R -i df
df[which(df$grammer == 'Python'),] #R 解法

  grammer score
0  Python     1
7  Python    10


## 提取列名

In [8]:
df.columns # Python 解法

Index(['grammer', 'score'], dtype='object')

In [9]:
%R names(df) # R 解法

array(['grammer', 'score'], dtype='<U7')

## 修改列名

In [10]:
df.rename(columns={'score':'popularity'}, inplace = True) # Python 解法
df  

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [11]:
%R df <- df %>%   dplyr::rename(popularity=score) # 管道操作管道函数的作用，％>％来自dplyr包的管道函数，其作用是将前一步的结果直接传参给下一步的函数，从而省略了中间的赋值步骤，可以大量减少内存中的对象，节省内存。
df 

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


## 字符统计

In [12]:
df['grammer'].value_counts()

Python    2
Java      1
GO        1
C         1
PHP       1
SQL       1
Name: grammer, dtype: int64

In [13]:
%R table(df$grammer)

array([1, 1, 1, 1, 2, 1], dtype=int32)

## 缺失值处理

In [14]:
df['popularity'] = df['popularity'].fillna(df['popularity'].interpolate())
df 

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,3.0
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [15]:
%%R 
index <- which(is.na(df$popularity))
index
df$popularity <- Hmisc::impute(df$popularity,                        
                (unlist(df[index-1, 2] + df1[index+1, 2]))/2)
df

  grammer popularity
0  Python          1
1       C          2
2    Java          3
3      GO          4
4    <NA>          5
5     SQL          6
6     PHP          7
7  Python         10


## 数据提取:popularity中大于3的行

In [16]:
df[df.popularity>3]

Unnamed: 0,grammer,popularity
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [17]:
%R df$popu1arity

In [18]:
%%R 
# df[df$popu1arity > 3,] #这种方法跟 pandas很相似
df[(df$popularity > 3) & (df$popularity >3),] # ???

  grammer popularity
3      GO          4
4    <NA>          5
5     SQL          6
6     PHP          7
7  Python         10


In [19]:
%R df %>% dplyr::filter(df$popularity > 3.0)

Unnamed: 0,grammer,popularity
3,GO,4.0
4,NA_character_,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


## 数据去重

In [20]:
df.drop_duplicates(['grammer'])

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,3.0
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0


In [21]:
%R df[!duplicated(df$grammer),]

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,3.0
3,GO,4.0
4,NA_character_,5.0
5,SQL,6.0
6,PHP,7.0


## 计算popularity列的平均值

In [22]:
df['popularity'].mean()

4.75

In [23]:
%R mean(df$popularity)

array([4.75])

In [24]:
%R df %>%   dplyr::summarise(mean = mean(popularity))

Unnamed: 0,mean
1,4.75


## 将grammer列转为list

In [26]:
df['grammer'].to_list()

['Python', 'C', 'Java', 'GO', nan, 'SQL', 'PHP', 'Python']

In [27]:
%R unlist(df$grammer)

array(['Python', 'C', 'Java', 'GO', NA_character_, 'SQL', 'PHP', 'Python'],
      dtype=object)

## 将DataFrame导出为EXCEL

In [28]:
df.to_excel('../data/120题.xlsx')

In [None]:
# R对EXCE文件不大友好第一种方法:利用 readr包转为csv再用 EXCEL打开文件本质依然是csv 

## 查看数据行列数

In [29]:
df.shape

(8, 2)

In [30]:
%R dim(df)

array([8, 2], dtype=int32)

## 提取popularity列大于3小于7的行

In [31]:
df[(df['popularity'] > 3) & (df['popularity'] < 7)]

Unnamed: 0,grammer,popularity
3,GO,4.0
4,,5.0
5,SQL,6.0


In [32]:
%R df[(df$popularity > 3) & (df$popularity <7),]

Unnamed: 0,grammer,popularity
3,GO,4.0
4,NA_character_,5.0
5,SQL,6.0


In [33]:
%R df %>% dplyr::filter(df$popularity > 3 & df$popularity <7)

Unnamed: 0,grammer,popularity
3,GO,4.0
4,NA_character_,5.0
5,SQL,6.0


## 交换两列位置

In [34]:
temp = df['popularity'] 
df.drop(labels=['popularity'], axis=1,inplace = True) 
df.insert(0, 'popularity', temp) # 指定位置插入一列
df 

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,
5,6.0,SQL
6,7.0,PHP
7,10.0,Python


In [35]:
%%R 
df <- df %>%     
dplyr::select(popularity,everything()) #以指定列顺序显示
df 

  popularity grammer
0          1  Python
1          2       C
2          3    Java
3          4      GO
4          5    <NA>
5          6     SQL
6          7     PHP
7         10  Python


## 提取 popularity列最大值所在行

In [36]:
df[df['popularity'] == df['popularity'].max()]

Unnamed: 0,popularity,grammer
7,10.0,Python


In [37]:
%R df %>%   dplyr::filter(popularity == max(popularity))

Unnamed: 0,popularity,grammer
7,10.0,Python


In [38]:
%R df[df$popularity == max(df$popularity),]

Unnamed: 0,popularity,grammer
7,10.0,Python


## 查看最后5行

In [39]:
df.tail()

Unnamed: 0,popularity,grammer
3,4.0,GO
4,5.0,
5,6.0,SQL
6,7.0,PHP
7,10.0,Python


In [40]:
%R tail(df,5)

Unnamed: 0,popularity,grammer
3,4.0,GO
4,5.0,NA_character_
5,6.0,SQL
6,7.0,PHP
7,10.0,Python


## 删除最后一行

In [41]:
df=df[:-1]
df 

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,
5,6.0,SQL
6,7.0,PHP


In [42]:
%R df[-dim(df)[1],]

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,NA_character_
5,6.0,SQL
6,7.0,PHP


In [43]:
%R df %>%   dplyr::filter(rownames(df) != max(rownames(df)))

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,NA_character_
5,6.0,SQL
6,7.0,PHP


## 添加一行数据['Per1',6,6

In [44]:
row = {'grammer':'Perl','popularity':6.6}; 
df.append(row,ignore_index=True)

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,
5,6.0,SQL
6,7.0,PHP
7,6.6,Perl


In [45]:
%%R 
row <- data.frame("grammer" = c("Perl"), 
                  "popularity" = c(6.6) ) 
df <- rbind(df,row)
df 

   popularity grammer
0         1.0  Python
1         2.0       C
2         3.0    Java
3         4.0      GO
4         5.0    <NA>
5         6.0     SQL
6         7.0     PHP
7        10.0  Python
11        6.6    Perl


## 对数据按照" popularity"列值的大小进行排序

In [47]:
df.sort_values("popularity")
df 

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,
5,6.0,SQL
6,7.0,PHP


In [48]:
%R df <- df %>%   dplyr::arrange(popularity)

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,NA_character_
5,6.0,SQL
11,6.6,Perl
6,7.0,PHP
7,10.0,Python


## 统计 grammer列每个字符串的长度

In [49]:
df = df.fillna(method='ffill')

In [50]:
df.grammer.apply(lambda x: len(x))

0    6
1    1
2    4
3    2
4    2
5    3
6    3
Name: grammer, dtype: int64

In [63]:
%R df$grammer <- Hmisc::impute(df$grammer,'NAN');df$len_str <- stringr::str_length(df$grammer)

array([6, 1, 4, 2, 3, 3, 4, 4, 3, 6], dtype=int32)

## Excel 读取

In [95]:
df = pd.read_excel('../data/HSI_close.xlsx')
df

Unnamed: 0,Date,HSI
0,2018-01-02,30515.310547
1,2018-01-03,30560.949219
2,2018-01-04,30736.480469
3,2018-01-05,30814.640625
4,2018-01-08,30899.529297
...,...,...
487,2019-12-23,27906.410156
488,2019-12-24,27864.210938
489,2019-12-27,28225.419922
490,2019-12-30,28319.390625


R语言处理 excel不友好,直接读取日期时数据会変成实数乡 openx1sx::read。x1Sx中的 detectdates参数只能识别纯日期as.Data转换该列后时问数据丢失,只有日期故先把exce1文件转存为csv后用 readr包读取

In [84]:
%%R 
#该方法不理想
library(openxlsx) 
df <- read.xlsx("../data/HSI_close.xlsx",detectDates=T) 
df$Date <- as.Date(df$Date, origin="1900-01-01") 
head(df)

        Date      HSI
1 2018-01-04 30515.31
2 2018-01-05 30560.95
3 2018-01-06 30736.48
4 2018-01-07 30814.64
5 2018-01-10 30899.53
6 2018-01-11 31011.41


In [96]:
df.to_csv("../data/120.csv",index=False)

In [97]:
%%R 
# 转存csv后再读
library(readr)
df <- read_csv("../data/120.csv")

R[write to console]: Parsed with column specification:
cols(
  Date = [34mcol_date(format = "")[39m,
  HSI = [32mcol_double()[39m
)



In [99]:
%R head(df)

Unnamed: 0,Date,HSI
1,17533.0,30515.310547
2,17534.0,30560.949219
3,17535.0,30736.480469
4,17536.0,30814.640625
5,17539.0,30899.529297
6,17540.0,31011.410156


## 数据查看

In [93]:
df.head()

Unnamed: 0,Date,HSI
0,2018-01-02,30515.310547
1,2018-01-03,30560.949219
2,2018-01-04,30736.480469
3,2018-01-05,30814.640625
4,2018-01-08,30899.529297


In [100]:
%R head(df,8)

Unnamed: 0,Date,HSI
1,17533.0,30515.310547
2,17534.0,30560.949219
3,17535.0,30736.480469
4,17536.0,30814.640625
5,17539.0,30899.529297
6,17540.0,31011.410156
7,17541.0,31073.720703
8,17542.0,31120.390625


## 数据计算

将sa1ary列数据转換为最大值与最小值的平均值

In [159]:
df=pd.read_csv("../data/pandas120.csv")
df.head()

Unnamed: 0,createTime,education,salary
0,2020/3/16 10:58,本科,20k-40k
1,2020/3/16 10:46,不限,20k-35k
2,2020/3/16 10:45,本科,13k-20k
3,2020/3/16 10:20,本科,10k-20k
4,2020/3/16 10:33,本科,10k-18k


In [146]:
def tomean(s):
    a=s.split("-")
    mins=float(a[0].split("k")[0])
    maxs=float(a[1].split("k")[0])
    return str((mins+maxs)/2)+"k"

In [155]:
df.head()

Unnamed: 0,createTime,education,salary,salary_mean
0,2020/3/16 10:58,本科,20k-40k,30.0k
1,2020/3/16 10:46,不限,20k-35k,27.5k
2,2020/3/16 10:45,本科,13k-20k,16.5k
3,2020/3/16 10:20,本科,10k-20k,15.0k
4,2020/3/16 10:33,本科,10k-18k,14.0k


In [154]:
df['salary_mean']=df['salary'].apply(tomean)

In [160]:
# iterrows + re
import re 
for index,row in df.iterrows():     
    nums = re.findall('\d+',row[2])     
    df.iloc[index,2] = int(eval(f'({nums[0]} + {nums[1]}) / 2 * 1000'))

In [162]:
df.head() 

Unnamed: 0,createTime,education,salary
0,2020/3/16 10:58,本科,30000
1,2020/3/16 10:46,不限,27500
2,2020/3/16 10:45,本科,16500
3,2020/3/16 10:20,本科,15000
4,2020/3/16 10:33,本科,14000


In [164]:
%%R 
library(stringr) 
df$salary <- df$salary %>%   str_replace_all('k','') %>%   str_split('-',simplify = T) %>%   apply(2,as.numeric) %>%   rowMeans() * 1000

R[write to console]: Error in rowMeans(.) : 'x'必需是阵列，而且至少得有两个维度

R[write to console]: 此外: 

R[write to console]: 1: Missing column names filled in: 'X1' [1] 

R[write to console]: 2: Unknown or uninitialised column: `salary`. 




Error in rowMeans(.) : 'x'必需是阵列，而且至少得有两个维度


## 数据分组

In [171]:
df.groupby('education').mean()

Unnamed: 0_level_0,salary
education,Unnamed: 1_level_1
不限,17250.0
本科,21714.285714
硕士,17750.0


In [170]:
df.salary=df.salary.astype('int64')

In [172]:
%%R 
df %>%   group_by(education) %>%   summarise(mean = mean(salary))

R[write to console]: Error in group_by(., education) : 没有"group_by"这个函数




Error in group_by(., education) : 没有"group_by"这个函数
