In [11]:
test_df = pd.read_csv('test.csv')
ids = test_df['id'].copy()

ids

0         750000
1         750001
2         750002
3         750003
4         750004
           ...  
249995    999995
249996    999996
249997    999997
249998    999998
249999    999999
Name: id, Length: 250000, dtype: int64

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
import joblib

# 1. Load the Test Data
# Replace 'test_data.csv' with your test file path
test_df = pd.read_csv('test.csv')
print("Test data columns:", test_df.columns.tolist())
print("Test data shape:", test_df.shape)

# Check for 'id' column
if 'id' not in test_df.columns:
    print("Warning: 'id' column not found in test data. Generating synthetic IDs.")
    test_df['id'] = range(1, len(test_df) + 1)
else:
    print("'id' column found in test data.")

# Load training data for encoding and scaling
train_df = pd.read_csv('preprocessed_podcast_data.csv')
print("Training data columns:", train_df.columns.tolist())

# 2. Preprocess the Test Data
# Handle Missing Values
num_cols = ['Episode_Length_minutes', 'Host_Popularity_percentage', 
            'Guest_Popularity_percentage', 'Number_of_Ads']
cat_cols = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 
            'Episode_Sentiment', 'Publication_Time']

# Numerical imputation
imputer_num = SimpleImputer(strategy='median')
test_df[num_cols] = imputer_num.fit_transform(test_df[num_cols])
train_df[num_cols] = imputer_num.fit_transform(train_df[num_cols])

# Categorical imputation
imputer_cat = SimpleImputer(strategy='constant', fill_value='Unknown')
test_df[cat_cols] = imputer_cat.fit_transform(test_df[cat_cols])
train_df[cat_cols] = imputer_cat.fit_transform(train_df[cat_cols])

# Data Type Conversion
for df in [train_df, test_df]:
    df['Episode_Length_minutes'] = df['Episode_Length_minutes'].astype(float)
    df['Host_Popularity_percentage'] = df['Host_Popularity_percentage'].astype(float)
    df['Guest_Popularity_percentage'] = df['Guest_Popularity_percentage'].astype(float)
    df['Number_of_Ads'] = df['Number_of_Ads'].astype(int)
    df['Publication_Day'] = df['Publication_Day'].astype('category')
    df['Genre'] = df['Genre'].astype('category')
    df['Episode_Sentiment'] = df['Episode_Sentiment'].astype('category')
    df['Publication_Time'] = df['Publication_Time'].astype('category')

# Outlier Capping
for df in [train_df, test_df]:
    for col in num_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

# Text Cleaning
for df in [train_df, test_df]:
    df['Podcast_Name'] = df['Podcast_Name'].str.lower().str.replace(r'[^a-z0-9\s]', '', regex=True)
    df['Episode_Title'] = df['Episode_Title'].str.lower().str.replace(r'[^a-z0-9\s]', '', regex=True)

# 3. Feature Engineering
# Categorical Encoding
cat_cols = ['Genre', 'Publication_Day', 'Episode_Sentiment', 'Publication_Time']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(train_df[cat_cols])  # Fit on training data
for df in [train_df, test_df]:
    ohe_features = ohe.transform(df[cat_cols])
    ohe_feature_names = ohe.get_feature_names_out(cat_cols)
    df_ohe = pd.DataFrame(ohe_features, columns=ohe_feature_names, index=df.index)
    df.drop(cat_cols, axis=1, inplace=True)
    df = pd.concat([df, df_ohe], axis=1)

# Target encode Podcast_Name
podcast_mean_listening = train_df.groupby('Podcast_Name')['Listening_Time_minutes'].mean()
for df in [train_df, test_df]:
    df['Podcast_Name_Encoded'] = df['Podcast_Name'].map(podcast_mean_listening)
    df['Podcast_Name_Encoded'] = df['Podcast_Name_Encoded'].fillna(podcast_mean_listening.mean())
    df.drop('Podcast_Name', axis=1, inplace=True)

# Text Features from Episode_Title
vectorizer = CountVectorizer(max_features=50, stop_words='english')
vectorizer.fit(train_df['Episode_Title'])  # Fit on training data
for df in [train_df, test_df]:
    df['Title_Length'] = df['Episode_Title'].apply(lambda x: len(x.split()))
    keywords = ['interview', 'exclusive', 'special', 'guest']
    for kw in keywords:
        df[f'Title_Has_{kw}'] = df['Episode_Title'].str.contains(kw, case=False, na=False).astype(int)
    title_bow = vectorizer.transform(df['Episode_Title'])
    bow_df = pd.DataFrame(title_bow.toarray(), 
                          columns=[f'Title_BOW_{f}' for f in vectorizer.get_feature_names_out()],
                          index=df.index)
    df.drop('Episode_Title', axis=1, inplace=True)
    df = pd.concat([df, bow_df], axis=1)

# Is_Weekend from encoded Publication_Day
for df in [train_df, test_df]:
    weekend_cols = [col for col in df.columns if 'Publication_Day_Saturday' in col or 'Publication_Day_Sunday' in col]
    df['Is_Weekend'] = df[weekend_cols].sum(axis=1).astype(int) if weekend_cols else 0

# Interaction Features
for df in [train_df, test_df]:
    df['Combined_Popularity'] = (df['Host_Popularity_percentage'] + df['Guest_Popularity_percentage']) / 2
    df['Ad_Density'] = df['Number_of_Ads'] / df['Episode_Length_minutes'].replace(0, 1)

# Scaling Numerical Features
num_cols = ['Episode_Length_minutes', 'Host_Popularity_percentage', 
            'Guest_Popularity_percentage', 'Number_of_Ads', 
            'Podcast_Name_Encoded', 'Title_Length', 
            'Combined_Popularity', 'Ad_Density']
scaler = StandardScaler()
scaler.fit(train_df[num_cols])  # Fit on engineered training data
train_df[num_cols] = scaler.transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# Ensure test data has same columns as training data
engineered_train_df = pd.read_csv('engineered_podcast_data.csv')
train_cols = engineered_train_df.drop('Listening_Time_minutes', axis=1).columns
test_cols = test_df.columns
missing_cols = [col for col in train_cols if col not in test_cols and col != 'id']
for col in missing_cols:
    test_df[col] = 0  # Add missing columns with zeros
extra_cols = [col for col in test_cols if col not in train_cols and col != 'id']
test_df = test_df.drop(extra_cols, axis=1, errors='ignore')

# Reorder columns to match training data
test_df = test_df[train_cols]

# 4. Load the Model
model = joblib.load('LinearRegression_model.pkl')
print("Loaded Linear Regression model.")

# 5. Make Predictions
test_df['id'] = ids
# Store 'id' column separately to avoid dropping issues
test_ids = test_df['id']
test_features = test_df.drop('id', axis=1)
predictions = model.predict(test_features)

# 6. Save Output
output_df = pd.DataFrame({
    'id': test_ids,
    'Listening_Time_minutes': predictions
})
output_df['Listening_Time_minutes'] = output_df['Listening_Time_minutes'].clip(lower=0)  # Ensure non-negative predictions
output_df.to_csv('predictions.csv', index=False)
print("Predictions saved as 'predictions.csv'")
print("Output shape:", output_df.shape)
print("Sample predictions:\n", output_df.head())

Test data columns: ['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment']
Test data shape: (250000, 11)
'id' column found in test data.
Training data columns: ['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Listening_Time_minutes']
Loaded Linear Regression model.
Predictions saved as 'predictions.csv'
Output shape: (250000, 2)
Sample predictions:
        id  Listening_Time_minutes
0  750000                     0.0
1  750001                     0.0
2  750002                     0.0
3  750003                     0.0
4  750004                     0.0


In [13]:
test_features

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Genre_Business,Genre_Comedy,Genre_Education,Genre_Health,Genre_Lifestyle,Genre_Music,...,Title_BOW_84,Title_BOW_85,Title_BOW_86,Title_BOW_87,Title_BOW_88,Title_BOW_99,Title_BOW_episode,Is_Weekend,Combined_Popularity,Ad_Density
0,0.468860,-0.950895,0.032578,-0.313158,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.604057,-0.400594
1,-1.179431,0.499718,0.033753,-1.213222,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.354963,-0.646622
2,0.150752,0.351072,1.762608,-1.213222,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.531719,-0.646622
3,1.644183,-1.594009,-0.029293,0.586907,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-1.074471,-0.309914
4,0.254637,-0.076942,-1.613260,0.586907,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-1.240516,-0.109388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,-1.399461,0.258387,1.719142,1.486971,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.438445,2.121984
249996,0.679857,-0.803997,-0.860631,0.586907,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-1.165724,-0.192204
249997,-1.687888,-1.483836,0.829848,-0.313158,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.368128,0.957538
249998,1.581916,-0.716558,1.609106,1.486971,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.713332,-0.132969


In [16]:
print("Test data feature stats:\n", test_df.describe())
print("Missing values in test data:\n", test_df.isna().sum())

Test data feature stats:
        Episode_Length_minutes  Host_Popularity_percentage  \
count           250000.000000               250000.000000   
mean                 0.001758                   -0.006270   
std                  1.000240                    1.000304   
min                 -1.998899                   -2.508184   
25%                 -0.808412                   -0.901055   
50%                 -0.014755                    0.001753   
75%                  0.841492                    0.853846   
max                  3.316348                    2.531364   

       Guest_Popularity_percentage  Number_of_Ads  Genre_Business  \
count                250000.000000  250000.000000        250000.0   
mean                     -0.003026      -0.000580             0.0   
std                       0.999344       1.001514             0.0   
min                      -2.055753      -1.213222             0.0   
25%                      -0.702821      -1.213222             0.0   
50%       

In [15]:
model = joblib.load('LinearRegression_model.pkl')
print("Model coefficients:\n", model.coef_)
print("Model intercept:", model.intercept_)

Model coefficients:
 [ 2.38420138e+01 -3.24718189e+11 -3.62538473e+11 -2.62222893e+00
  7.96946757e+11  7.96946757e+11  7.96946757e+11  7.96946757e+11
  7.96946757e+11  7.96946757e+11  7.96946757e+11  7.96946757e+11
  7.96946757e+11  7.96946757e+11  1.98072601e+12  1.98072601e+12
  2.84126992e+12  2.84126992e+12  1.98072601e+12  1.98072601e+12
  1.98072601e+12  1.73778657e+12  1.73778657e+12  1.73778657e+12
  1.37492269e+12  1.37492269e+12  1.37492269e+12  1.37492269e+12
  3.22267391e-01  5.43287701e+12 -2.95850070e+12 -3.31142494e+12
 -3.93009261e+12  3.88447662e+12 -4.22363281e-01 -4.52026367e-01
 -1.41357422e-01  7.10449219e-01 -4.02832031e-01  2.52929688e-01
  4.00390625e-01 -2.38708496e-01 -3.15917969e-01  5.82763672e-01
  6.26098633e-01  1.51464844e+00  7.36816406e-01  5.28564453e-01
  4.61181641e-01  7.30957031e-01  8.52050781e-02 -5.66528320e-01
  1.19628906e-02 -1.90490723e-01 -5.68115234e-01 -3.47412109e-01
  6.51626587e-02 -6.42578125e-01  4.96093750e-01  2.08496094e-01
  8.

In [17]:
engineered_train_df = pd.read_csv('engineered_podcast_data.csv')
train_cols = engineered_train_df.drop('Listening_Time_minutes', axis=1).columns
print("Training columns:", train_cols.tolist())
print("Test columns:", test_df.columns.tolist())

Training columns: ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Genre_Business', 'Genre_Comedy', 'Genre_Education', 'Genre_Health', 'Genre_Lifestyle', 'Genre_Music', 'Genre_News', 'Genre_Sports', 'Genre_Technology', 'Genre_True Crime', 'Publication_Day_Friday', 'Publication_Day_Monday', 'Publication_Day_Saturday', 'Publication_Day_Sunday', 'Publication_Day_Thursday', 'Publication_Day_Tuesday', 'Publication_Day_Wednesday', 'Episode_Sentiment_Negative', 'Episode_Sentiment_Neutral', 'Episode_Sentiment_Positive', 'Publication_Time_Afternoon', 'Publication_Time_Evening', 'Publication_Time_Morning', 'Publication_Time_Night', 'Podcast_Name_Encoded', 'Title_Length', 'Title_Has_interview', 'Title_Has_exclusive', 'Title_Has_special', 'Title_Has_guest', 'Title_BOW_12', 'Title_BOW_18', 'Title_BOW_19', 'Title_BOW_20', 'Title_BOW_23', 'Title_BOW_24', 'Title_BOW_26', 'Title_BOW_27', 'Title_BOW_28', 'Title_BOW_29', 'Title_BOW_30', 'Title_BOW_

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
import joblib

# 1. Load the Test Data
# Replace 'test_data.csv' with your test file path
test_df = pd.read_csv('test.csv')
print("Test data columns:", test_df.columns.tolist())
print("Test data shape:", test_df.shape)
print("Test data sample:\n", test_df.head())
print("Test data stats:\n", test_df.describe())

# Check for 'id' column
if 'id' not in test_df.columns:
    print("Warning: 'id' column not found in test data. Generating synthetic IDs.")
    test_df['id'] = range(1, len(test_df) + 1)
else:
    print("'id' column found in test data.")

# Load training data for encoding and scaling
train_df = pd.read_csv('preprocessed_podcast_data.csv')
print("Training data columns:", train_df.columns.tolist())

# 2. Preprocess the Test Data
# Handle Missing Values
num_cols = ['Episode_Length_minutes', 'Host_Popularity_percentage', 
            'Guest_Popularity_percentage', 'Number_of_Ads']
cat_cols = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 
            'Episode_Sentiment', 'Publication_Time']

# Numerical imputation
imputer_num = SimpleImputer(strategy='median')
test_df[num_cols] = imputer_num.fit_transform(test_df[num_cols])
train_df[num_cols] = imputer_num.fit_transform(train_df[num_cols])

# Categorical imputation
imputer_cat = SimpleImputer(strategy='constant', fill_value='Unknown')
test_df[cat_cols] = imputer_cat.fit_transform(test_df[cat_cols])
train_df[cat_cols] = imputer_cat.fit_transform(train_df[cat_cols])

# Data Type Conversion
for df in [train_df, test_df]:
    df['Episode_Length_minutes'] = df['Episode_Length_minutes'].astype(float)
    df['Host_Popularity_percentage'] = df['Host_Popularity_percentage'].astype(float)
    df['Guest_Popularity_percentage'] = df['Guest_Popularity_percentage'].astype(float)
    df['Number_of_Ads'] = df['Number_of_Ads'].astype(int)
    df['Publication_Day'] = df['Publication_Day'].astype('category')
    df['Genre'] = df['Genre'].astype('category')
    df['Episode_Sentiment'] = df['Episode_Sentiment'].astype('category')
    df['Publication_Time'] = df['Publication_Time'].astype('category')

# Outlier Capping
for df in [train_df, test_df]:
    for col in num_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

# Text Cleaning
for df in [train_df, test_df]:
    df['Podcast_Name'] = df['Podcast_Name'].str.lower().str.replace(r'[^a-z0-9\s]', '', regex=True)
    df['Episode_Title'] = df['Episode_Title'].str.lower().str.replace(r'[^a-z0-9\s]', '', regex=True)

# 3. Feature Engineering
# Categorical Encoding
cat_cols = ['Genre', 'Publication_Day', 'Episode_Sentiment', 'Publication_Time']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(train_df[cat_cols])  # Fit on training data
for df in [train_df, test_df]:
    ohe_features = ohe.transform(df[cat_cols])
    ohe_feature_names = ohe.get_feature_names_out(cat_cols)
    df_ohe = pd.DataFrame(ohe_features, columns=ohe_feature_names, index=df.index)
    df.drop(cat_cols, axis=1, inplace=True)
    df = pd.concat([df, df_ohe], axis=1)

# Target encode Podcast_Name
podcast_mean_listening = train_df.groupby('Podcast_Name')['Listening_Time_minutes'].mean()
for df in [train_df, test_df]:
    df['Podcast_Name_Encoded'] = df['Podcast_Name'].map(podcast_mean_listening)
    df['Podcast_Name_Encoded'] = df['Podcast_Name_Encoded'].fillna(podcast_mean_listening.mean())
    df.drop('Podcast_Name', axis=1, inplace=True)

# Text Features from Episode_Title
vectorizer = CountVectorizer(max_features=50, stop_words='english')
vectorizer.fit(train_df['Episode_Title'])  # Fit on training data
for df in [train_df, test_df]:
    df['Title_Length'] = df['Episode_Title'].apply(lambda x: len(x.split()))
    keywords = ['interview', 'exclusive', 'special', 'guest']
    for kw in keywords:
        df[f'Title_Has_{kw}'] = df['Episode_Title'].str.contains(kw, case=False, na=False).astype(int)
    title_bow = vectorizer.transform(df['Episode_Title'])
    bow_df = pd.DataFrame(title_bow.toarray(), 
                          columns=[f'Title_BOW_{f}' for f in vectorizer.get_feature_names_out()],
                          index=df.index)
    df.drop('Episode_Title', axis=1, inplace=True)
    df = pd.concat([df, bow_df], axis=1)

# Is_Weekend from encoded Publication_Day
for df in [train_df, test_df]:
    weekend_cols = [col for col in df.columns if 'Publication_Day_Saturday' in col or 'Publication_Day_Sunday' in col]
    df['Is_Weekend'] = df[weekend_cols].sum(axis=1).astype(int) if weekend_cols else 0

# Interaction Features
for df in [train_df, test_df]:
    df['Combined_Popularity'] = (df['Host_Popularity_percentage'] + df['Guest_Popularity_percentage']) / 2
    df['Ad_Density'] = df['Number_of_Ads'] / df['Episode_Length_minutes'].replace(0, 1)

# Scaling Numerical Features
num_cols = ['Episode_Length_minutes', 'Host_Popularity_percentage', 
            'Guest_Popularity_percentage', 'Number_of_Ads', 
            'Podcast_Name_Encoded', 'Title_Length', 
            'Combined_Popularity', 'Ad_Density']
scaler = StandardScaler()
scaler.fit(train_df[num_cols])  # Fit on engineered training data
train_df[num_cols] = scaler.transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# Debugging: Inspect test data features
print("Test data feature stats after engineering:\n", test_df.drop('id', axis=1).describe())
print("Missing values in test data:\n", test_df.isna().sum())

# Ensure test data has same columns as training data
engineered_train_df = pd.read_csv('engineered_podcast_data.csv')
train_cols = engineered_train_df.drop('Listening_Time_minutes', axis=1).columns
test_cols = test_df.columns
missing_cols = [col for col in train_cols if col not in test_cols and col != 'id']
for col in missing_cols:
    test_df[col] = 0  # Add missing columns with zeros
extra_cols = [col for col in test_cols if col not in train_cols and col != 'id']
test_df = test_df.drop(extra_cols, axis=1, errors='ignore')

# Reorder columns to match training data
test_df = test_df[train_cols]

# Debugging: Compare columns
print("Training columns:", train_cols.tolist())
print("Test columns:", test_df.columns.tolist())

# 4. Load the Model
model = joblib.load('LinearRegression_model.pkl')
print("Loaded Linear Regression model.")
print("Model coefficients:\n", model.coef_)
print("Model intercept:", model.intercept_)

test_df['id'] = ids
# 5. Make Predictions
test_ids = test_df['id']
test_features = test_df.drop('id', axis=1)
predictions = model.predict(test_features)
print("Prediction stats:\n", pd.Series(predictions).describe())

# 6. Save Output
output_df = pd.DataFrame({
    'id': test_ids,
    'Listening_Time_minutes': predictions
})
output_df['Listening_Time_minutes'] = output_df['Listening_Time_minutes'].clip(lower=0, upper=119.97)  # Clip to training range
output_df.to_csv('predictions.csv', index=False)
print("Predictions saved as 'predictions.csv'")
print("Output shape:", output_df.shape)
print("Sample predictions:\n", output_df.head())

Test data columns: ['id', 'Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment']
Test data shape: (250000, 11)
Test data sample:
        id         Podcast_Name Episode_Title  Episode_Length_minutes  \
0  750000  Educational Nuggets    Episode 73                   78.96   
1  750001          Sound Waves    Episode 23                   27.87   
2  750002        Joke Junction    Episode 11                   69.10   
3  750003        Comedy Corner    Episode 73                  115.39   
4  750004         Life Lessons    Episode 50                   72.32   

       Genre  Host_Popularity_percentage Publication_Day Publication_Time  \
0  Education                       38.11        Saturday          Evening   
1      Music                       71.29          Sunday          Morning   
2     Comedy                       67.89          Frida