In [5]:
import hashlib #通过计算文件SHA1，检验完整性
import os
import tarfile
import zipfile #解压缩
import requests#用于下载文件
import numpy as np
import pandas as pd
import torch
from torch import nn #神经网络模块
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split #用于训练集数据集划分
import torch.optim as optim #优化模块
from torch.utils.data import DataLoader, TensorDataset

# The hash-verified download helpers below are adapted from Mu Li's book,
# so they closely resemble the originals.
# Registry of datasets: name -> (url, sha1_hash).
DATA_HUB = dict()
DATA_HUB['national-survey-on-school-dropout'] = (
    # A file:// URL points at an archive on the local disk.
    'file://national-survey-on-school-dropout.zip',
    # No SHA-1 digest is known for this archive, so the hash is None.
    None,
)

def download(name, cache_dir=os.path.join('data')):
    """Fetch the DATA_HUB entry ``name`` into ``cache_dir`` and return its path.

    Args:
        name: Key into ``DATA_HUB``; the entry is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory that receives the file (created if missing).

    Returns:
        Path of the local file: ``cache_dir`` joined with the last path
        component of the URL.

    Raises:
        AssertionError: ``name`` is not registered in ``DATA_HUB``.
        FileNotFoundError: a ``file://`` URL points at a missing local file.
        requests.RequestException: the HTTP download failed (connection
            error, timeout, or non-2xx status).
    """
    # Raised explicitly (instead of via ``assert``) so the check survives
    # ``python -O`` while keeping the same exception type and message.
    if name not in DATA_HUB:
        raise AssertionError(f"{name} 不存在于 {DATA_HUB}")
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])

    # Cache hit only when a reference hash exists and the file matches it.
    # With no hash there is nothing to verify against, so the file is
    # fetched again below.
    if os.path.exists(fname) and sha1_hash:
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            # Hash in 1 MiB chunks so large files are never fully in memory.
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname

    print(f'正在从{url}下载{fname}...')

    if url.startswith('file://'):
        # Local source: strip the 'file://' prefix to get the real path.
        local_path = url[7:]
        print(f'正在从本地复制 {local_path} 到 {fname}...')

        if not os.path.exists(local_path):
            raise FileNotFoundError(f"本地文件不存在: {local_path}")

        # shutil.copy2 raises SameFileError when source and destination are
        # the same file (e.g. cache_dir is the source's own directory); in
        # that case the file is already where it needs to be.
        if os.path.exists(fname) and os.path.samefile(local_path, fname):
            return fname

        import shutil
        shutil.copy2(local_path, fname)
    else:
        # Remote source: stream to a temporary name and rename at the end,
        # so an interrupted transfer never leaves a truncated file at fname.
        tmp_name = fname + '.part'
        with requests.get(url, stream=True, verify=True, timeout=60) as r:
            # Fail loudly instead of saving an HTTP error page as the dataset.
            r.raise_for_status()
            with open(tmp_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1048576):
                    f.write(chunk)
        os.replace(tmp_name, fname)

    return fname

#@save
def download_extract(name, folder=None):
    """Download the DATA_HUB archive ``name`` and unpack it next to itself.

    Args:
        name: Key into ``DATA_HUB``, passed straight to ``download``.
        folder: Optional sub-directory name to return instead of the default.

    Returns:
        ``<archive dir>/<folder>`` when ``folder`` is given, otherwise the
        archive path with its final extension removed.

    Raises:
        AssertionError: the archive extension is not ``.zip``, ``.tar`` or
            ``.gz``.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)  # extraction target: the archive's directory
    data_dir, ext = os.path.splitext(fname)

    if ext == '.zip':
        open_archive = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        open_archive = tarfile.open
    else:
        # Explicit raise: a bare ``assert False`` is stripped under
        # ``python -O`` and would surface later as a confusing NameError.
        raise AssertionError(f'只有zip/tar文件可以被解压缩: {fname}')

    # NOTE(review): extractall() trusts member paths inside the archive; only
    # use this with archives from a trusted source.
    with open_archive(fname, 'r') as fp:  # closes the handle even on error
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():
    """Download every file registered in DATA_HUB."""
    for dataset_name in DATA_HUB:
        download(dataset_name)
        
# Fetch and unpack the archive; keep the extracted directory as a module-level name.
data_dir = download_extract('national-survey-on-school-dropout')

# Read the two CSV tables from the extracted directory.
df_vivienda, df_enape = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('df01_TipoVivienda.csv', 'df03_Dataset_ENAPE.csv')
)

# Show the first rows of each table as a quick sanity check.
print(df_vivienda.head(), df_enape.head(), sep='\n')

# Inner-join the two tables (df01 and df03) on their shared 'FOLIO' column.
merged_df = pd.merge(df_vivienda, df_enape, on='FOLIO', how='inner')

# Feature columns used for training. The '_x' suffix is what pandas adds to
# the left table's columns when both tables carry a column of the same name.
features = [
    'EDAD', 'nivel_edu', 'estres', 'depresion', 'desespero',
    't_smartphone', 't_laptop', 't_PC', 't_tablet', 't_TV',
    'q_personas_x', 'q_hombres_x', 'q_mujeres_x', 'x<30_x', 'pc_x',
    'laptop_x', 'tv_plana_x', 'tablet_x', 'smartphone_x', 'inscrito'
]
# 'terminado' is the label that says whether the student dropped out.
target = 'terminado'

# Keep only the selected columns and drop every row whose target is 2 (the
# author observed tens of thousands of such rows and chose to discard them).
# The explicit .copy() makes final_df an independent frame, so the column
# assignments below neither trigger SettingWithCopyWarning nor depend on
# whether pandas returned a view of merged_df.
final_df = merged_df.loc[merged_df[target] != 2, features + [target]].copy()

# Replace the value 9 in every numeric feature with the mean of that column's
# other values.
# NOTE(review): 9 is presumably the survey's "no answer" code; confirm this
# also holds for columns such as 'EDAD', where 9 could be a genuine value.
for column in features:
    values = final_df[column]
    if not pd.api.types.is_numeric_dtype(values):
        continue
    # Mean over the entries that are not 9 (NaN is skipped by .mean()).
    col_mean = values[values != 9].mean()
    final_df[column] = values.replace(9, col_mean)
        



# Separate the features from the label.
X = final_df.drop(columns=[target])  # every column except the target
y = final_df[target]                 # the target column only

# Split into training and test sets. test_size=0.4 gives a 60/40 split.
# NOTE(review): the previous comment said 8:2; if that is the intended ratio,
# change test_size to 0.2.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Standardise the features so that the raw magnitude of a column does not
# dominate the fit. The scaler is fitted on the training rows only, so no
# statistics from the test set leak into training, and the scaled arrays
# (not the raw frames) are what the model receives. Previously X_scaled was
# computed but never used.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Whole-dataset scaled matrix, kept under its original name for any later use.
X_scaled = scaler.transform(X)

# Convert the NumPy arrays to PyTorch tensors.
# NOTE(review): the labels are cast to long and fed to a 2-class
# CrossEntropyLoss, which assumes 'terminado' only holds 0/1 once the rows
# with value 2 are removed; confirm against the data.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Wrap the tensors in DataLoaders for mini-batch training; only the training
# loader shuffles.
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

#设置MLP分类模型
class DropoutClassifier(nn.Module):
    """MLP classifier with a single hidden layer.

    Pipeline: Linear(input_size -> hidden_size), ReLU, Dropout(0.5),
    Linear(hidden_size -> output_size). The output is raw logits.

    A second hidden layer was tried earlier; accuracy went straight to 100%,
    so it was removed and only one hidden layer remains.
    """

    def __init__(self, input_size, hidden_size=64, output_size=2):
        super().__init__()
        # Hidden projection followed by the output projection.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)
        # Dropout regularisation against overfitting.
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return the logits for the input batch ``x``."""
        hidden = self.dropout(torch.relu(self.fc1(x)))
        return self.fc2(hidden)
        
# Loss: cross-entropy over the two output logits (binary classification).
criterion = nn.CrossEntropyLoss()

# Instantiate the model with one input unit per feature column.
model = DropoutClassifier(X_train.shape[1])

# Optimizer: Adam (adaptive) with a learning rate of 0.1.
optimizer = optim.Adam(params=model.parameters(), lr=0.1)


#训练模块 （大框架不变）
def train(model, train_loader, test_loader, criterion, optimizer, epochs=50,lr=0.1):
    """Train ``model`` and report train/test accuracy while doing so.

    Each epoch runs one optimisation pass over ``train_loader`` and then one
    gradient-free evaluation pass over ``test_loader``. A progress line is
    printed for every 10th epoch (starting with the first) and for the last.

    Args:
        model: Module mapping an input batch to per-class scores.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Iterable of ``(inputs, labels)`` evaluation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.
        lr: Ignored. Kept only so existing calls keep working; the learning
            rate in effect is the one ``optimizer`` was constructed with.

    Returns:
        None. Metrics are only printed.
    """
    nan = float('nan')  # reported when a loader yields no batches

    for epoch in range(epochs):
        model.train()  # enable dropout and other training-time behaviour
        running_loss = 0.0  # sum of per-batch losses
        n_batches = 0       # counted here so loaders without len() also work
        correct_preds = 0
        total_preds = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()  # clear gradients from the previous step
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Predicted class = index of the largest score in each row.
            predicted = outputs.argmax(dim=1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)
            running_loss += loss.item()
            n_batches += 1

        # Guard the divisions so an empty loader reports NaN instead of
        # raising ZeroDivisionError.
        train_accuracy = 100 * correct_preds / total_preds if total_preds else nan
        avg_train_loss = running_loss / n_batches if n_batches else nan

        # Same accuracy computation on the test set, in evaluation mode.
        model.eval()
        correct_preds = 0
        total_preds = 0
        with torch.no_grad():  # no gradients are needed for evaluation
            for inputs, labels in test_loader:
                outputs = model(inputs)
                predicted = outputs.argmax(dim=1)
                correct_preds += (predicted == labels).sum().item()
                total_preds += labels.size(0)

        test_accuracy = 100 * correct_preds / total_preds if total_preds else nan

        # Report every 10th epoch and always the final one.
        if epoch % 10 == 0 or epoch == epochs - 1:
            print(
                f'Epoch [{epoch + 1}/{epochs}], '
                f'Train Loss: {avg_train_loss:.4f}, '
                f'Train Accuracy: {train_accuracy:.2f}%, '
                f'Test Accuracy: {test_accuracy:.2f}%'
            )

# Run 20 epochs of training with the model, loaders, loss and optimizer built above.
train(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=20,
)




正在从file://national-survey-on-school-dropout.zip下载data\national-survey-on-school-dropout.zip...
正在从本地复制 national-survey-on-school-dropout.zip 到 data\national-survey-on-school-dropout.zip...
   FOLIO  q_personas  q_hombres  q_mujeres  x<30   pc  laptop  tv_plana  \
0      1           1          1          0     2  0.0     0.0       0.0   
1      2           4          2          2     1  1.0     1.0       1.0   
2      3           2          0          2     2  0.0     0.0       0.0   
3      4           1          0          1     2  0.0     0.0       0.0   
4      5           2          1          1     2  0.0     0.0       0.0   

   tablet  smartphone  InternetF  sIF_causa  mejora_vida  mejora_trabajo  \
0     0.0         0.0        0.0        0.0          0.0             0.0   
1     1.0         1.0        1.0        0.0          1.0             1.0   
2     0.0         0.0        0.0        0.0          0.0             0.0   
3     0.0         0.0        0.0        0.0          0.0