In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import urllib.request

In [5]:

url_titanic = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
urllib.request.urlretrieve(url_titanic, 'titanic.csv')


In [15]:
data_titanic = pd.read_csv('titanic.csv')
print("泰坦尼克号数据集的列名：")
print(data_titanic.columns)
print("原始泰坦尼克号数据：")
print(data_titanic.head())

泰坦尼克号数据集的列名：
Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard',
       'Parents/Children Aboard', 'Fare'],
      dtype='object')
原始泰坦尼克号数据：
   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4  

In [17]:
# 处理缺失值：数值特征用均值填补，类别特征用众数填补
numeric_imputer = SimpleImputer(strategy='mean')
data_titanic[['Age', 'Fare']] = numeric_imputer.fit_transform(data_titanic[['Age', 'Fare']])



# 显示处理后的泰坦尼克号数据
print("处理后的泰坦尼克号数据：")
print(data_titanic.head())

# 保存处理后的泰坦尼克号数据
data_titanic.to_csv('processed_titanic.csv', index=False)

处理后的泰坦尼克号数据：
   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  


In [6]:
url_iris = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
urllib.request.urlretrieve(url_iris, 'iris.csv')
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

In [19]:

data_iris = pd.read_csv('iris.csv', names=column_names)

# 显示原始鸢尾花数据
print("原始鸢尾花数据：")
print(data_iris.head())

# 编码类别特征
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_categories = encoder.fit_transform(data_iris[['class']])
encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(['class']))
data_iris = data_iris.join(encoded_df).drop(['class'], axis=1)

# 显示处理后的鸢尾花数据
print("处理后的鸢尾花数据：")
print(data_iris.head())

# 保存处理后的鸢尾花数据
data_iris.to_csv('processed_iris.csv', index=False)

原始鸢尾花数据：
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
处理后的鸢尾花数据：
   sepal_length  sepal_width  petal_length  petal_width  class_Iris-setosa  \
0           5.1          3.5           1.4          0.2                1.0   
1           4.9          3.0           1.4          0.2                1.0   
2           4.7          3.2           1.3          0.2                1.0   
3           4.6          3.1           1.5          0.2                1.0   
4           5.0          3.6           1.4          0.2                1.0   

   class_Iris-versicolor  class_Iris-virginica  
0                    0.0                   0.0  

In [8]:
url_boston = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
urllib.request.urlretrieve(url_boston, 'boston.csv')
column_names_boston = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]

In [20]:

data_boston = pd.read_csv(url_boston, delim_whitespace=True, names=column_names_boston)

# 显示原始波士顿房价数据
print("原始波士顿房价数据：")
print(data_boston.head())

# 规范化数值特征
scaler = StandardScaler()
data_boston_scaled = pd.DataFrame(scaler.fit_transform(data_boston), columns=data_boston.columns)

# 显示处理后的波士顿房价数据
print("处理后的波士顿房价数据：")
print(data_boston_scaled.head())

# 保存处理后的波士顿房价数据
data_boston_scaled.to_csv('processed_boston.csv', index=False)

  data_boston = pd.read_csv(url_boston, delim_whitespace=True, names=column_names_boston)


原始波士顿房价数据：
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  
处理后的波士顿房价数据：
       CRIM        ZN     INDUS      CHAS       NOX        RM       AGE  \
0 -0.419782  0.284830 -1.287909 -0.272599 -0.144217  0.413672 -0.120013   
1 -0.417339 -0.487722 -0.593381 -0.272599 -0.740262  0.194274  0.367166   
2 -0.417342 -0.487722 -0.593381 -0.272599 -0.740262  1.282714 -0.265812   
3 -0.416750 -0.487722 -1.306878 -

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [4]:
url_boston = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
column_names_boston = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
data_boston = pd.read_csv(url_boston, delim_whitespace=True, names=column_names_boston)

# 标准化缩放
standard_scaler = StandardScaler()
data_boston_standard = pd.DataFrame(standard_scaler.fit_transform(data_boston), columns=data_boston.columns)

# 最小最大缩放
minmax_scaler = MinMaxScaler()
data_boston_minmax = pd.DataFrame(minmax_scaler.fit_transform(data_boston), columns=data_boston.columns)

# 稳健缩放
robust_scaler = RobustScaler()
data_boston_robust = pd.DataFrame(robust_scaler.fit_transform(data_boston), columns=data_boston.columns)

# 显示缩放后的数据
print("标准化缩放后的数据：")
print(data_boston_standard.head())
print("最小最大缩放后的数据：")
print(data_boston_minmax.head())
print("稳健缩放后的数据：")
print(data_boston_robust.head())

# 保存缩放后的数据
data_boston_standard.to_csv('boston_standard_scaled.csv', index=False)
data_boston_minmax.to_csv('boston_minmax_scaled.csv', index=False)
data_boston_robust.to_csv('boston_robust_scaled.csv', index=False)

  data_boston = pd.read_csv(url_boston, delim_whitespace=True, names=column_names_boston)


标准化缩放后的数据：
       CRIM        ZN     INDUS      CHAS       NOX        RM       AGE  \
0 -0.419782  0.284830 -1.287909 -0.272599 -0.144217  0.413672 -0.120013   
1 -0.417339 -0.487722 -0.593381 -0.272599 -0.740262  0.194274  0.367166   
2 -0.417342 -0.487722 -0.593381 -0.272599 -0.740262  1.282714 -0.265812   
3 -0.416750 -0.487722 -1.306878 -0.272599 -0.835284  1.016303 -0.809889   
4 -0.412482 -0.487722 -1.306878 -0.272599 -0.835284  1.228577 -0.511180   

        DIS       RAD       TAX   PTRATIO         B     LSTAT      MEDV  
0  0.140214 -0.982843 -0.666608 -1.459000  0.441052 -1.075562  0.159686  
1  0.557160 -0.867883 -0.987329 -0.303094  0.441052 -0.492439 -0.101524  
2  0.557160 -0.867883 -0.987329 -0.303094  0.396427 -1.208727  1.324247  
3  1.077737 -0.752922 -1.106115  0.113032  0.416163 -1.361517  1.182758  
4  1.077737 -0.752922 -1.106115  0.113032  0.441052 -1.026501  1.487503  
最小最大缩放后的数据：
       CRIM    ZN     INDUS  CHAS       NOX        RM       AGE       DIS  \
0  0.

In [5]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
import urllib.request

# 下载并加载泰坦尼克号数据集

data_titanic = pd.read_csv('titanic.csv')

# 检查数据集的列名
print("泰坦尼克号数据集的列名：")
print(data_titanic.columns)
print("原始泰坦尼克号数据：")
print(data_titanic.head())

# 定义异常值处理方法
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

def replace_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    median = df[column].median()
    df.loc[(df[column] < lower_bound) | (df[column] > upper_bound), column] = median
    return df

# 删除异常值
data_titanic_no_outliers = remove_outliers(data_titanic, 'Fare')

# 替换异常值
data_titanic_replaced_outliers = replace_outliers(data_titanic.copy(), 'Fare')

# 显示处理后的数据
print("删除异常值后的泰坦尼克号数据：")
print(data_titanic_no_outliers.head())
print("替换异常值后的泰坦尼克号数据：")
print(data_titanic_replaced_outliers.head())

# 保存处理后的数据
data_titanic_no_outliers.to_csv('titanic_no_outliers.csv', index=False)
data_titanic_replaced_outliers.to_csv('titanic_replaced_outliers.csv', index=False)


泰坦尼克号数据集的列名：
Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard',
       'Parents/Children Aboard', 'Fare'],
      dtype='object')
原始泰坦尼克号数据：
   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0      NaN  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4  

In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
import urllib.request

# 下载并加载鸢尾花数据集

column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
data_iris = pd.read_csv('iris.csv', names=column_names)

# 显示原始鸢尾花数据
print("原始鸢尾花数据：")
print(data_iris.head())

# 假设非标准特征为“类”列，进行独热编码
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_categories = encoder.fit_transform(data_iris[['class']])
encoded_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(['class']))
data_iris = data_iris.join(encoded_df).drop(['class'], axis=1)

# 显示处理后的鸢尾花数据
print("处理后的鸢尾花数据：")
print(data_iris.head())

# 特征选择：过滤方法
X = data_iris.drop(columns=encoder.get_feature_names_out(['class']))
y = encoded_df.idxmax(axis=1)
select_k_best = SelectKBest(f_classif, k=2)
X_new = select_k_best.fit_transform(X, y)
print("过滤方法选择的特征：")
print(X.columns[select_k_best.get_support()])

# 特征选择：包裹方法
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2)
rfe.fit(X, y)
print("包裹方法选择的特征：")
print(X.columns[rfe.support_])

# 特征选择：嵌入方法
model = RandomForestClassifier()
model.fit(X, y)
importances = model.feature_importances_
indices = np.argsort(importances)[-2:]  # 选择最重要的两个特征
print("嵌入方法选择的特征：")
print(X.columns[indices])

# 保存处理后的鸢尾花数据
data_iris.to_csv('processed_iris.csv', index=False)



原始鸢尾花数据：
   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
处理后的鸢尾花数据：
   sepal_length  sepal_width  petal_length  petal_width  class_Iris-setosa  \
0           5.1          3.5           1.4          0.2                1.0   
1           4.9          3.0           1.4          0.2                1.0   
2           4.7          3.2           1.3          0.2                1.0   
3           4.6          3.1           1.5          0.2                1.0   
4           5.0          3.6           1.4          0.2                1.0   

   class_Iris-versicolor  class_Iris-virginica  
0                    0.0                   0.0  