In [2]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm',
                               'Budapest_PaRis', 'Brussels_londOn'],
                   'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
                   'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
                   'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )',
                               '12. Air France', '"Swiss Air"']})

In [3]:
df

Unnamed: 0,From_To,FlightNumber,RecentDelays,Airline
0,LoNDon_paris,10045.0,"[23, 47]",KLM(!)
1,MAdrid_miLAN,,[],<Air France> (12)
2,londON_StockhOlm,10065.0,"[24, 43, 87]",(British Airways. )
3,Budapest_PaRis,,[13],12. Air France
4,Brussels_londOn,10085.0,"[67, 32]","""Swiss Air"""


In [7]:
# 缺失值填充

df['FlightNumber'] = df['FlightNumber'].interpolate().astype(int)
df

Unnamed: 0,From_To,FlightNumber,RecentDelays,Airline
0,LoNDon_paris,10045,"[23, 47]",KLM(!)
1,MAdrid_miLAN,10055,[],<Air France> (12)
2,londON_StockhOlm,10065,"[24, 43, 87]",(British Airways. )
3,Budapest_PaRis,10075,[13],12. Air France
4,Brussels_londOn,10085,"[67, 32]","""Swiss Air"""


In [18]:
# 分割对象
temp = df['From_To'].str.split("_",expand = True)
temp.columns = ['From','To']
temp


Unnamed: 0,From,To
0,LoNDon,paris
1,MAdrid,miLAN
2,londON,StockhOlm
3,Budapest,PaRis
4,Brussels,londOn


In [27]:
# 只有首字母大写

temp['From'] = temp['From'].str.capitalize()
temp['To'] = temp['To'].str.capitalize()
temp
df

Unnamed: 0,From_To,FlightNumber,RecentDelays,Airline
0,LoNDon_paris,10045,"[23, 47]",KLM(!)
1,MAdrid_miLAN,10055,[],<Air France> (12)
2,londON_StockhOlm,10065,"[24, 43, 87]",(British Airways. )
3,Budapest_PaRis,10075,[13],12. Air France
4,Brussels_londOn,10085,"[67, 32]","""Swiss Air"""


* df.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')
# 参数说明：
* labels: 要删除的行或列的标签，可以是单个标签或列表。
* axis: 指定删除行还是列。axis=0 表示删除行，axis=1 表示删除列。
* index: 用于删除行的标签，等价于 labels 参数，axis=0。
* columns: 用于删除列的标签，等价于 labels 参数，axis=1。
* level: 如果是多层索引，指定要删除的索引级别。
* inplace: 是否在原 DataFrame 上进行修改。如果为 True，则会直接修改原 DataFrame；如果为 False，则返回一个新 DataFrame。
* errors: 如果设置为 'raise'，当标签不存在时会抛出错误；如果设置为 'ignore'，则忽略不存在的标签

In [30]:
df.drop('From_To', axis = 1,inplace = True)


KeyError: "['From_To'] not found in axis"

In [None]:
# 选择

df[['From','To']] = temp

列的选择和赋值：

1. df[['From', 'To']] 是一个选择 DataFrame 的列的操作，双中括号用于表示选择多个列，并返回一个新的 DataFrame，而不是一个 Series。
2. temp 是一个包含两列（From 和 To）的 DataFrame。因此，当我们使用 df[['From', 'To']] = temp 时，实际上是在 df 中创建或更新 From 和 To 这两列，并将 temp 中相应的列数据赋给它们。
保证数据对齐：

3. 使用 df[['From', 'To']] 可以确保 temp 中的列 From 和 To 能够正确对齐到 df 中的 From 和 To 列，且一次性完成赋值操作。如果你只使用一个中括号，比如 df['From', 'To'] = temp，会引发语法错误，因为这不是合法的 DataFrame 语法。
避免逐列赋值：

4. 如果你改为 df['From'] = temp['From'] 和 df['To'] = temp['To']，则需要分别进行两次赋值操作。而 df[['From', 'To']] = temp 可以在一次操作中同时赋值两列，代码更加简洁。

In [33]:
df

Unnamed: 0,FlightNumber,RecentDelays,Airline,From,To
0,10045,"[23, 47]",KLM(!),London,Paris
1,10055,[],<Air France> (12),Madrid,Milan
2,10065,"[24, 43, 87]",(British Airways. ),London,Stockholm
3,10075,[13],12. Air France,Budapest,Paris
4,10085,"[67, 32]","""Swiss Air""",Brussels,London


In [35]:
df = df._append([10086,[23],'aribus','london','paris'])

In [36]:
df


Unnamed: 0,FlightNumber,RecentDelays,Airline,From,To,0
0,10045.0,"[23, 47]",KLM(!),London,Paris,
1,10055.0,[],<Air France> (12),Madrid,Milan,
2,10065.0,"[24, 43, 87]",(British Airways. ),London,Stockholm,
3,10075.0,[13],12. Air France,Budapest,Paris,
4,10085.0,"[67, 32]","""Swiss Air""",Brussels,London,
0,,,,,,10086
1,,,,,,[23]
2,,,,,,aribus
3,,,,,,london
4,,,,,,paris


In [38]:
df.drop(index = 0)

Unnamed: 0,FlightNumber,RecentDelays,Airline,From,To,0
1,10055.0,[],<Air France> (12),Madrid,Milan,
2,10065.0,"[24, 43, 87]",(British Airways. ),London,Stockholm,
3,10075.0,[13],12. Air France,Budapest,Paris,
4,10085.0,"[67, 32]","""Swiss Air""",Brussels,London,
1,,,,,,[23]
2,,,,,,aribus
3,,,,,,london
4,,,,,,paris


In [41]:
df = df[df['FlightNumber != NaN']]

KeyError: 'FlightNumber != NaN'

In [42]:
# 去重空值

df = df[pd.notna(df['FlightNumber'])]

In [43]:
df

Unnamed: 0,FlightNumber,RecentDelays,Airline,From,To,0
0,10045.0,"[23, 47]",KLM(!),London,Paris,
1,10055.0,[],<Air France> (12),Madrid,Milan,
2,10065.0,"[24, 43, 87]",(British Airways. ),London,Stockholm,
3,10075.0,[13],12. Air France,Budapest,Paris,
4,10085.0,"[67, 32]","""Swiss Air""",Brussels,London,


In [45]:
df = df._append(df.loc[df['FlightNumber'] == 10085, :], ignore_index=True)
#df

Unnamed: 0,FlightNumber,RecentDelays,Airline,From,To,0
0,10045.0,"[23, 47]",KLM(!),London,Paris,
1,10055.0,[],<Air France> (12),Madrid,Milan,
2,10065.0,"[24, 43, 87]",(British Airways. ),London,Stockholm,
3,10075.0,[13],12. Air France,Budapest,Paris,
4,10085.0,"[67, 32]","""Swiss Air""",Brussels,London,
5,10085.0,"[67, 32]","""Swiss Air""",Brussels,London,
6,10085.0,"[67, 32]","""Swiss Air""",Brussels,London,
7,10085.0,"[67, 32]","""Swiss Air""",Brussels,London,


In [48]:
# 去重

# 将 'list_column' 转换为字符串或元组
df['list_column'] = df['list_column'].apply(lambda x: tuple(x) if isinstance(x, list) else x)

# 现在可以安全地使用 drop_duplicates()
df = df.drop_duplicates()


KeyError: 'list_column'