## 创建Dataframe 

In [1]:
# numpy is imported in this cell because np.nan is used here, and this cell
# runs before the one that imports numpy/pandas; on a fresh kernel the
# original order raised NameError.
import numpy as np

# Sample data: language names and scores, each column with one missing value.
data = {
    "grammer": ["Python", "C", "Java", "GO", np.nan, "SQL", "PHP", "Python"],
    "score": [1, 2, np.nan, 4, 5, 6, 7, 10],
}

In [2]:
import numpy as np
import pandas as pd

# Turn the column-oriented dict from the previous cell into a DataFrame
# and show it as the cell output.
df = pd.DataFrame(data=data)
df

Unnamed: 0,grammer,score
0,Python,1.0
1,C,2.0
2,Java,
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


R 中没有字典的概念,故直接创建 data.frame / tibble

In [3]:
# Import the rpy2 IPython integration and load it as an extension so the
# %R / %%R magics used in later cells are available in this Python kernel.
import rpy2.ipython
%load_ext rpy2.ipython

In [4]:
%%R
# Option 1: a base-R data.frame; NA marks the missing entries.
df1 <- data.frame(
    grammer = c("Python", "C", "Java", "GO", NA, "SQL", "PHP", "Python"),
    score = c(1, 2, NA, 4, 5, 6, 7, 10)
)
df1

  grammer score
1  Python     1
2       C     2
3    Java    NA
4      GO     4
5    <NA>     5
6     SQL     6
7     PHP     7
8  Python    10


In [5]:
%%R
# Option 2: the same data as a tibble (requires the tibble package).
library(tibble)
df2 <- tibble(
    grammer = c("Python", "C", "Java", "GO", NA, "SQL", "PHP", "Python"),
    score = c(1, 2, NA, 4, 5, 6, 7, 10)
)
df2

[90m# A tibble: 8 x 2[39m
  grammer score
  [3m[90m<chr>[39m[23m   [3m[90m<dbl>[39m[23m
[90m1[39m Python      1
[90m2[39m C           2
[90m3[39m Java       [31mNA[39m
[90m4[39m GO          4
[90m5[39m [31mNA[39m          5
[90m6[39m SQL         6
[90m7[39m PHP         7
[90m8[39m Python     10


也可以用 tribble 按行(横向)创建 tibble

## 数据提取

In [6]:
# Python solution: boolean-mask row selection on the 'grammer' column.
df.loc[df["grammer"] == 'Python']

Unnamed: 0,grammer,score
0,Python,1.0
7,Python,10.0


In [8]:
%%R -i df
# R solution: subset() keeps the rows where the test is TRUE and drops rows
# where it evaluates to NA, the same rows which() selects.
subset(df, grammer == 'Python')

  grammer score
0  Python     1
7  Python    10


## 提取列名

In [9]:
df.columns # Python solution: the column labels of df

Index(['grammer', 'score'], dtype='object')

In [10]:
%R names(df) # R solution: column names of the R data frame

array(['grammer', 'score'], dtype='<U7')

## 修改列名

In [11]:
# Python solution: rename 'score' to 'popularity'. The result is reassigned
# instead of using inplace=True, which keeps the step chainable and avoids
# mutating a frame that an earlier cell already displayed.
df = df.rename(columns={'score': 'popularity'})
df

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [18]:
# R solution: rename the column on the R-side data frame and store it back.
# NOTE(review): `%>%` has to be available in the R session (e.g. dplyr or
# magrittr attached) -- confirm, no cell shown here attaches it explicitly.
%R df <- df %>%   dplyr::rename(popularity=score)
# This line is plain Python, so it displays the pandas df, not the R one.
df 

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,3.0
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


## 字符统计

In [13]:
# Count how often each distinct value occurs in the 'grammer' column.
df['grammer'].value_counts()

Python    2
C         1
Java      1
PHP       1
GO        1
SQL       1
Name: grammer, dtype: int64

In [14]:
# R counterpart: table() counts the occurrences of each grammer value.
%R table(df$grammer)

array([1, 1, 1, 1, 2, 1], dtype=int32)

## 缺失值处理

In [15]:
# interpolate() only changes the missing entries, so assigning its result
# back fills the NaN while the known values stay as they were.
df['popularity'] = df['popularity'].interpolate()
df

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,3.0
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [19]:
%%R
# Positions of the missing popularity values.
index <- which(is.na(df$popularity))
index
# Fill each gap with the mean of the values directly before and after it.
# Both neighbours are read from df's popularity column by name; the original
# took one of them from the unrelated frame df1 and addressed the column by
# position.
df$popularity <- Hmisc::impute(df$popularity,
                (df$popularity[index - 1] + df$popularity[index + 1]) / 2)
df

  grammer popularity
0  Python          1
1       C          2
2    Java          3
3      GO          4
4    <NA>          5
5     SQL          6
6     PHP          7
7  Python         10


## 数据提取:popularity中大于3的行

In [21]:
# Keep the rows whose popularity is strictly greater than 3.
df.loc[df["popularity"] > 3]

Unnamed: 0,grammer,popularity
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [24]:
%R df$popu1arity

In [25]:
%%R
# Base-R row filter, the direct analogue of the pandas boolean mask.
# A single condition is enough; the earlier attempt only failed because the
# column name was misspelled with the digit 1.
df[df$popularity > 3, ]

  grammer popularity
3      GO          4
4    <NA>          5
5     SQL          6
6     PHP          7
7  Python         10


In [26]:
%R df %>% dplyr::filter(df$popularity > 3.0)

Unnamed: 0,grammer,popularity
3,GO,4.0
4,NA_character_,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


## 数据去重

In [27]:
# Keep one row per distinct 'grammer' value (the first occurrence by default).
df.drop_duplicates(subset=['grammer'])

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,3.0
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0


In [28]:
# R counterpart: duplicated() flags repeats after their first occurrence;
# negating it keeps one row per distinct grammer value.
%R df[!duplicated(df$grammer),]

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,3.0
3,GO,4.0
4,NA_character_,5.0
5,SQL,6.0
6,PHP,7.0


## 计算popularity列的平均值

In [29]:
# Arithmetic mean of the 'popularity' column.
df['popularity'].mean()

4.75

In [30]:
# R counterpart using base mean().
%R mean(df$popularity)

array([4.75])

In [31]:
# dplyr counterpart: summarise() returns a one-row frame holding the mean.
%R df %>%   dplyr::summarise(mean = mean(popularity))

Unnamed: 0,mean
1,4.75


## 将grammer列转为list

In [32]:
# Convert the 'grammer' column to a plain Python list (missing values stay nan).
df['grammer'].tolist()

['Python', 'C', 'Java', 'GO', nan, 'SQL', 'PHP', 'Python']

In [33]:
# R counterpart: unlist() returns the grammer column as a flat vector.
%R unlist(df$grammer)

array(['Python', 'C', 'Java', 'GO', NA_character_, 'SQL', 'PHP', 'Python'],
      dtype=object)

## 将DataFrame导出为EXCEL

In [34]:
import os

# to_excel does not create missing directories, so make sure the target
# folder exists before writing; otherwise a fresh checkout fails here.
excel_path = './data/120ti.xlsx'
os.makedirs(os.path.dirname(excel_path), exist_ok=True)
df.to_excel(excel_path)

In [None]:
# R 对 EXCEL 文件不大友好。第一种方法:利用 readr 包导出为 csv,再用 EXCEL 打开;文件本质上依然是 csv

## 查看数据行列数

In [35]:
# (number of rows, number of columns) of the frame.
df.shape

(8, 2)

In [36]:
# R counterpart: dim() returns the row and column counts.
%R dim(df)

array([8, 2], dtype=int32)

## 提取popularity列大于3小于7的行

In [37]:
# Rows with popularity strictly between 3 and 7 (both bounds excluded).
df[df['popularity'].gt(3) & df['popularity'].lt(7)]

Unnamed: 0,grammer,popularity
3,GO,4.0
4,,5.0
5,SQL,6.0


In [38]:
# Base-R counterpart: combine both conditions with & and index the rows.
%R df[(df$popularity > 3) & (df$popularity <7),]

Unnamed: 0,grammer,popularity
3,GO,4.0
4,NA_character_,5.0
5,SQL,6.0


In [39]:
%R df %>% dplyr::filter(df$popularity > 3 & df$popularity <7)

Unnamed: 0,grammer,popularity
3,GO,4.0
4,NA_character_,5.0
5,SQL,6.0


## 交换两列位置

In [40]:
# Move 'popularity' to the front: pop() removes the column from df and
# returns it, and insert() puts it back at position 0.
df.insert(0, 'popularity', df.pop('popularity'))
df

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,
5,6.0,SQL
6,7.0,PHP
7,10.0,Python


In [42]:
%%R 
# R counterpart: select() with popularity first and everything() after it
# moves that column to the front; the result is stored back into df.
df <- df %>%     
dplyr::select(popularity,everything())
df 

  popularity grammer
0          1  Python
1          2       C
2          3    Java
3          4      GO
4          5    <NA>
5          6     SQL
6          7     PHP
7         10  Python


## 提取 popularity列最大值所在行

In [43]:
# Rows whose popularity equals the column maximum (ties would all be kept).
df[df['popularity'].eq(df['popularity'].max())]

Unnamed: 0,popularity,grammer
7,10.0,Python


In [44]:
# dplyr counterpart: keep the rows where popularity equals its maximum.
%R df %>%   dplyr::filter(popularity == max(popularity))

Unnamed: 0,popularity,grammer
7,10.0,Python


In [45]:
# Base-R counterpart: logical row index comparing each value with the maximum.
%R df[df$popularity == max(df$popularity),]

Unnamed: 0,popularity,grammer
7,10.0,Python


## 查看最后5行

In [46]:
# Last five rows; 5 is also the default, spelled out to match the heading.
df.tail(5)

Unnamed: 0,popularity,grammer
3,4.0,GO
4,5.0,
5,6.0,SQL
6,7.0,PHP
7,10.0,Python


In [47]:
# R counterpart: the last five rows of the data frame.
%R tail(df,5)

Unnamed: 0,popularity,grammer
3,4.0,GO
4,5.0,NA_character_
5,6.0,SQL
6,7.0,PHP
7,10.0,Python


## 删除最后一行

In [48]:
# Drop the final row by position and rebind df to the shortened frame.
df = df.iloc[:-1]
df

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,
5,6.0,SQL
6,7.0,PHP


In [50]:
%R df[-dim(df)[1],]

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,NA_character_
5,6.0,SQL
6,7.0,PHP


In [51]:
%R df %>%   dplyr::filter(rownames(df) != max(rownames(df)))

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,NA_character_
5,6.0,SQL
6,7.0,PHP


## 添加一行数据 ['Perl', 6.6]

In [56]:
# DataFrame.append was deprecated and removed in pandas 2.0, and its result
# was never stored, so the new row was lost. Wrap the record in a one-row
# frame, concatenate it, and rebind df so later cells see the added row.
# Note: re-running this cell adds the row again.
row = {'grammer': 'Perl', 'popularity': 6.6}
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
df

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,
5,6.0,SQL
6,7.0,PHP
7,6.6,Perl
8,6.6,Perl


In [57]:
%%R 
row <- data.frame("grammer" = c("Perl"), 
                  "popularity" = c(6.6) ) 
df <- rbind(df,row)
df 

   popularity grammer
0         1.0  Python
1         2.0       C
2         3.0    Java
3         4.0      GO
4         5.0    <NA>
5         6.0     SQL
6         7.0     PHP
7        10.0  Python
11        6.6    Perl
12        6.6    Perl


## 对数据按照" popularity"列值的大小进行排序

In [58]:
# Sort ascending by popularity and rebind df. Reassigning instead of using
# inplace=True avoids mutating a frame that an earlier cell obtained by
# slicing, and keeps the step chainable.
df = df.sort_values("popularity")
df

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,
5,6.0,SQL
7,6.6,Perl
6,7.0,PHP


In [59]:
# dplyr counterpart: arrange() orders the rows by popularity (ascending) and
# the result is stored back into the R-side df.
%R df <- df %>%   dplyr::arrange(popularity)

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,NA_character_
5,6.0,SQL
11,6.6,Perl
12,6.6,Perl
6,7.0,PHP
7,10.0,Python


## 统计 grammer列每个字符串的长度

In [61]:
# Forward-fill missing values so every 'grammer' entry is a string before
# measuring lengths. ffill() replaces fillna(method='ffill'), whose `method`
# argument is deprecated in recent pandas.
df = df.ffill()

In [62]:
# Length of each string in 'grammer'. The vectorised .str accessor replaces
# the per-element lambda and yields NaN for a missing value instead of
# raising TypeError.
df['grammer'].str.len()

0    6
1    1
2    4
3    2
4    2
5    3
7    4
6    3
Name: grammer, dtype: int64

In [63]:
# R counterpart: replace missing grammer values with the literal 'NAN', then
# store each string's length in a new len_str column.
%R df$grammer <- Hmisc::impute(df$grammer,'NAN');df$len_str <- stringr::str_length(df$grammer)

array([6, 1, 4, 2, 3, 3, 4, 4, 3, 6], dtype=int32)