In [4]:
import os
from zipfile import ZipFile

import pandas as pd
import requests
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

# URL and filenames
# Remote zip archive that download_and_extract() fetches.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip'
# Local path the archive is saved to; main() deletes it once it is done.
zip_filename = 'student.zip'
# Directory the archive is unpacked into and that load_dataset() reads from.
extract_folder = 'student'

# Function to download and extract the dataset
def download_and_extract(url, zip_filename, extract_folder, timeout=30):
    """Download a zip archive and unpack it into a folder.

    Args:
        url: Address of the zip archive to fetch.
        zip_filename: Local path the downloaded archive is written to.
        extract_folder: Directory the archive members are extracted into.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    # Using the response as a context manager releases the connection even if
    # writing the file fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail here rather than saving an HTTP error page as the "zip" and
        # hitting a confusing BadZipFile further down.
        response.raise_for_status()
        with open(zip_filename, 'wb') as f:
            # stream=True only helps if the body is consumed in chunks;
            # reading response.content would buffer the whole archive in memory.
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)

    # Extract the contents
    with ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

    print(f'Dataset extracted to folder: {extract_folder}')

# Function to load dataset into pandas DataFrame
def load_dataset(extract_folder):
    """Read the two student CSV files found in ``extract_folder``.

    The directory listing is printed first, then ``student-mat.csv`` and
    ``student-por.csv`` are parsed as semicolon-separated files.

    Args:
        extract_folder: Directory containing the extracted CSV files.

    Returns:
        A ``(mat_data, por_data)`` tuple of DataFrames, in that order.
    """
    print('Extracted Files:', os.listdir(extract_folder))

    # Both files share the same ';' delimiter, so read them the same way.
    mat_frame, por_frame = (
        pd.read_csv(os.path.join(extract_folder, csv_name), sep=';')
        for csv_name in ('student-mat.csv', 'student-por.csv')
    )
    return mat_frame, por_frame

# Main function to orchestrate the process
def main():
    """Download the student data, fit a linear regression on G3, and report it.

    Steps: download and extract the archive, load the CSV files, fit a
    ``LinearRegression`` on five columns of the ``student-mat.csv`` table with
    ``G3`` as the target, print the test-set MSE, 5-fold cross-validation
    scores and one example prediction, then delete the downloaded zip file.

    The zip file is removed even when an earlier step raises, so a failed run
    does not leave the archive behind.
    """
    try:
        # Download and extract the dataset
        download_and_extract(url, zip_filename, extract_folder)

        # Only the 'student-mat.csv' table is modelled here; the second
        # DataFrame returned by load_dataset is deliberately ignored.
        mat_data, _ = load_dataset(extract_folder)

        # Select relevant features and target variable
        features = ['Medu', 'studytime', 'absences', 'Dalc', 'Walc']
        X = mat_data[features]
        y = mat_data['G3']

        # Split data into training and testing sets (fixed seed for
        # reproducible output)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Initialize and fit the model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Evaluate model performance
        mse = mean_squared_error(y_test, y_pred)
        print(f'\nMean Squared Error: {mse}')

        # Cross-validation for additional evaluation. With no `scoring`
        # argument, cross_val_score falls back to the estimator's own score
        # method (R^2 for LinearRegression), so these numbers are not on the
        # same scale as the MSE printed above. cross_val_score fits clones,
        # leaving `model` itself untouched.
        cv_scores = cross_val_score(model, X, y, cv=5)
        print(f'Cross-Validation Scores: {cv_scores}')
        print(f'Average Cross-Validation Score: {cv_scores.mean()}')

        # Example prediction: head(1) keeps a one-row DataFrame, which is the
        # 2-D input predict() expects.
        sample_data = X_test.head(1)
        predicted_grade = model.predict(sample_data)[0]
        print(f'\nExample Prediction: Predicted Grade = {predicted_grade:.2f}')
    finally:
        # Clean up: delete the downloaded zip file. The existence check keeps
        # a failed download (no file written) from masking the original error
        # with a FileNotFoundError.
        if os.path.exists(zip_filename):
            os.remove(zip_filename)
            print(f'\nDeleted {zip_filename}')

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()


Dataset extracted to folder: student
Extracted Files: ['student.txt', 'student-mat.csv', 'student-por.csv', 'student-merge.R']

Mean Squared Error: 20.7125800877253
Cross-Validation Scores: [-0.11252529 -0.01992873  0.02926078 -0.03513501  0.04773784]
Average Cross-Validation Score: -0.01811808371183197

Example Prediction: Predicted Grade = 9.01

Deleted student.zip
