In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from fuzzywuzzy import fuzz
import logging
import re
import os
from urllib.parse import urlparse
import requests

# Configure logging once for the notebook: INFO level, timestamped messages.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def is_valid_url(url):
    """Return True when *url* is a string that starts with ``http://`` or ``https://``."""
    if not isinstance(url, str):
        return False
    return url.startswith('http://') or url.startswith('https://')

def fuzzy_url_match(url1, url2, threshold=80):
    """Return True when the ``fuzz.ratio`` score of the two URLs reaches *threshold*.

    The score scale is whatever ``fuzz.ratio`` returns (presumably 0-100;
    confirm against the fuzzywuzzy documentation).
    """
    similarity = fuzz.ratio(url1, url2)
    return similarity >= threshold

def get_filename_without_extension(url):
    """Return the last path segment of *url* with its file extension removed.

    Only the path component of the URL is considered: the query string and
    fragment are discarded first, so a ``/`` or ``.`` inside them cannot be
    mistaken for part of the filename.
    """
    try:
        path = urlparse(url).path
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. unbalanced IPv6
        # brackets); fall back to treating the raw string as a path.
        path = url
    return os.path.splitext(os.path.basename(path))[0]

def is_filename_consistent(preview_url, full_url):
    """Return True when *full_url* names the same file as *preview_url*.

    Extensions are ignored. The preview filename is compared after its
    preview marker has been removed, using two rules:

    * only ``_PREVIEW`` removed (the original rule), or
    * every marker that ``get_filename_without_preview`` strips removed
      (``_PREVIEW``, ``_preview``, ``sample_``, ``_sample``).

    Accepting either rule keeps every previously consistent pair consistent
    while no longer rejecting URLs inferred from the lower-case / "sample"
    marker variants.
    """
    preview_filename = get_filename_without_extension(preview_url)
    full_filename = get_filename_without_extension(full_url)
    candidates = {
        preview_filename.replace('_PREVIEW', ''),
        re.sub(r'(_PREVIEW|_preview|sample_|_sample)', '', preview_filename),
    }
    return full_filename in candidates

def mp4_url_exists(url):
    """Check with an HTTP HEAD request whether *url* serves a video file.

    Returns True only when the response status is 200 and its Content-Type
    header contains ``video/mp4``, ``application/octet-stream`` or
    ``binary/octet-stream``. Every other outcome (NaN input, other status
    codes, request errors) is logged and reported as False.
    """
    if pd.isna(url):
        logger.warning(f"Invalid URL (NaN value)")
        return False

    try:
        # Redirects are followed; the request is abandoned after 5 seconds.
        response = requests.head(url, timeout=5, allow_redirects=True)
        status = response.status_code

        if status == 404:
            logger.info(f"File not found (404): {url}")
            return False
        if status != 200:
            logger.warning(f"URL returned status code {status}: {url}")
            return False

        content_type = response.headers.get('Content-Type', '').lower()
        for accepted in ('video/mp4', 'application/octet-stream', 'binary/octet-stream'):
            if accepted in content_type:
                return True

        logger.warning(f"URL exists but content type is {content_type}: {url}")
        return False
    except requests.RequestException as e:
        logger.error(f"Error checking URL: {e}: {url}")
        return False
    except Exception as e:
        logger.error(f"Unexpected error checking URL: {e}: {url}")
        return False

def extract_url_features(url):
    """Break *url* into a flat dict of features.

    The dict holds the six ``urlparse`` components plus the path length, the
    number of non-empty path segments and the file extension. A missing
    (NaN/None) URL yields empty strings and zeros for the same keys.
    """
    component_names = ('scheme', 'netloc', 'path', 'params', 'query', 'fragment')

    if pd.isna(url):
        features = dict.fromkeys(component_names, '')
        features['path_length'] = 0
        features['num_directories'] = 0
        features['file_extension'] = ''
        return features

    parsed = urlparse(url)
    features = {name: getattr(parsed, name) for name in component_names}

    path = parsed.path
    features['path_length'] = len(path)
    # Count only non-empty segments so leading/trailing slashes are ignored.
    features['num_directories'] = sum(1 for segment in path.split('/') if segment)
    features['file_extension'] = os.path.splitext(path)[1]
    return features

def get_filename_without_preview(url):
    """Return the basename of *url* with preview/sample markers removed.

    Every occurrence of ``_PREVIEW``, ``_preview``, ``sample_`` and
    ``_sample`` is deleted from the filename. A missing URL yields ``''``.
    """
    if pd.isna(url):
        return ''
    marker_pattern = re.compile(r'(_PREVIEW|_preview|sample_|_sample)')
    return marker_pattern.sub('', os.path.basename(url))

def simple_inference(url):
    """Guess the full-video URL by stripping preview markers from the last URL segment.

    Everything before the final ``/`` is kept unchanged. A missing URL
    yields ``''``.
    """
    if pd.isna(url):
        return ''
    *leading_segments, last_segment = url.split('/')
    cleaned_name = get_filename_without_preview(last_segment)
    return '/'.join([*leading_segments, cleaned_name])

class URLRegressor(BaseEstimator, RegressorMixin):
    """Nearest-neighbour lookup from a feature row to a training target.

    The wrapped KNN model is fitted against the row positions of the training
    targets; ``predict`` returns the target belonging to the single closest
    training row (cosine distance).
    """

    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.knn = KNeighborsRegressor(n_neighbors=n_neighbors, metric='cosine')

    def fit(self, X, y):
        """Fit the neighbour index on *X* and remember the targets *y*.

        ``n_neighbors`` is capped at the number of training samples and the
        KNN model is rebuilt with the capped value. Returns ``self``.
        """
        self.X_train = X
        self.y_train = y
        # Positional copy of the targets: indexing a pandas Series with an
        # integer would look the value up by label, not by row position.
        self._targets = list(y)
        self.n_neighbors = min(self.n_neighbors, len(self._targets))  # Ensure n_neighbors doesn't exceed sample size
        self.knn = KNeighborsRegressor(n_neighbors=self.n_neighbors, metric='cosine')
        self.knn.fit(X, range(len(self._targets)))
        return self

    def predict(self, X):
        """Return, for each row of *X*, the target of its nearest training row.

        On any error the failure is logged and a list of empty strings (one
        per input row) is returned instead.
        """
        try:
            # KNeighborsRegressor.predict() would average the neighbours' row
            # positions, which does not identify any particular training row.
            # Ask for the closest neighbour's position directly instead.
            _, nearest = self.knn.kneighbors(X, n_neighbors=1)
            return [self._targets[int(position)] for position in nearest[:, 0]]
        except Exception as e:
            logger.error(f"Error in URLRegressor predict: {e}")
            return ['' for _ in range(len(X))]  # Return empty strings if prediction fails

def prepare_data(df):
    """Build the feature frame and target series used by the models.

    Returns ``(X, y)``: ``X`` is the ``Video_URL`` column followed by the
    columns produced by ``extract_url_features``; ``y`` is the ``New_URL``
    column. Missing values in both are replaced with ``''``.
    """
    # One feature dict per row, kept aligned with the original index.
    feature_rows = [extract_url_features(video_url) for video_url in df['Video_URL']]
    feature_frame = pd.DataFrame(feature_rows, index=df.index)

    combined = pd.concat([df[['Video_URL']], feature_frame], axis=1)
    targets = df['New_URL']

    return combined.fillna(''), targets.fillna('')

class KNNURLRegressor(BaseEstimator, RegressorMixin):
    """Predict a URL by returning the first filename-consistent neighbour.

    For every query row the nearest training rows (cosine distance) are
    inspected in order and the first training target whose filename is
    consistent with the query's ``Video_URL`` is returned; ``''`` is returned
    when none of the neighbours qualifies.
    """

    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.knn = KNeighborsRegressor(n_neighbors=n_neighbors, metric='cosine')
        self.label_encoder = LabelEncoder()

    @staticmethod
    def _split_features(X):
        """Return ``(feature_matrix, video_urls)`` for *X*.

        For a DataFrame only the numeric columns are used as features (string
        columns cannot be fed to the KNN model) and ``video_urls`` is the
        ``Video_URL`` column as a list, or ``None`` when that column is
        absent. Any other input is densified if it offers ``toarray`` and is
        returned with ``video_urls`` set to ``None``.
        """
        if isinstance(X, pd.DataFrame):
            video_urls = X['Video_URL'].tolist() if 'Video_URL' in X.columns else None
            features = X.select_dtypes(include='number').to_numpy(dtype=float)
            return features, video_urls
        features = X.toarray() if hasattr(X, 'toarray') else X
        return features, None

    def fit(self, X, y):
        """Fit the neighbour index on *X* and remember the targets *y*.

        ``n_neighbors`` is capped at the number of training samples and the
        KNN model is rebuilt so the capped value actually takes effect.
        Returns ``self``.
        """
        self.X_train, _ = self._split_features(X)
        self.y_train = y
        # Positional copy: neighbour ids returned by kneighbors() are row
        # positions, whereas a pandas Series would be indexed by label.
        self._targets = list(y)
        self.y_encoded = self.label_encoder.fit_transform(self._targets)
        self.n_neighbors = min(self.n_neighbors, len(self._targets))
        self.knn = KNeighborsRegressor(n_neighbors=self.n_neighbors, metric='cosine')
        self.knn.fit(self.X_train, self.y_encoded)
        return self

    def predict(self, X):
        """Return one predicted URL (or ``''``) per row of *X*.

        *X* must be a DataFrame with a ``Video_URL`` column, because the
        filename check needs the query URL. On any error the failure is
        logged and a list of empty strings is returned.
        """
        n_rows = X.shape[0] if hasattr(X, 'shape') else len(X)
        try:
            features, video_urls = self._split_features(X)
            if video_urls is None:
                raise ValueError("predict() needs a DataFrame with a 'Video_URL' column")

            _, indices = self.knn.kneighbors(features)
            predictions = []
            for video_url, neighbor_ids in zip(video_urls, indices):
                for idx in neighbor_ids:
                    predicted_url = self._targets[idx]
                    # Compare against the query row's own URL.
                    if is_filename_consistent(video_url, predicted_url):
                        predictions.append(predicted_url)
                        break
                else:
                    predictions.append('')  # No consistent match found
            return predictions
        except Exception as e:
            logger.error(f"Error in KNNURLRegressor predict: {e}")
            return ['' for _ in range(n_rows)]

class XGBoostURLRegressor:
    """Predict target URLs by classifying URL features with XGBoost.

    Despite the name, the wrapped model is an ``XGBClassifier``: every
    distinct target value is label-encoded into its own class. Inputs are
    DataFrames holding the columns ``path_length``, ``num_directories``,
    ``scheme``, ``netloc`` and ``file_extension``; a ``Video_URL`` column is
    dropped when present.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.model = xgb.XGBClassifier(
            objective='multi:softprob',
            max_depth=max_depth,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
        )

        # Numeric columns: mean-impute, then standardise.
        numeric_pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ])
        # Categorical columns: fill gaps with 'missing', then one-hot encode;
        # categories unseen during fit are ignored at predict time.
        categorical_pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ])

        self.preprocessor = ColumnTransformer(transformers=[
            ('num', numeric_pipeline, ['path_length', 'num_directories']),
            ('cat', categorical_pipeline, ['scheme', 'netloc', 'file_extension']),
        ])

        self.label_encoder = LabelEncoder()

    @staticmethod
    def _log_frame(X):
        """Log the column names and dtypes of *X*."""
        logger.info(f"X columns: {X.columns}")
        logger.info(f"X dtypes: {X.dtypes}")

    @staticmethod
    def _feature_columns(X):
        """Return *X* without its ``Video_URL`` column, if it has one."""
        return X.drop('Video_URL', axis=1) if 'Video_URL' in X.columns else X

    def fit(self, X, y):
        """Fit the preprocessor, the label encoder and the classifier; return ``self``."""
        logger.info(f"Fitting XGBoostURLRegressor with {len(X)} samples")
        self._log_frame(X)

        features = self.preprocessor.fit_transform(self._feature_columns(X))
        encoded_targets = self.label_encoder.fit_transform(y)

        logger.info(f"Preprocessed X shape: {features.shape}")
        logger.info(f"y_encoded shape: {encoded_targets.shape}")

        self.model.fit(features, encoded_targets)
        return self

    def predict(self, X):
        """Return the decoded predicted target for every row of *X*.

        If preprocessing or prediction fails, the error is logged and an
        array of empty strings (one per row) is returned.
        """
        logger.info(f"Predicting with XGBoostURLRegressor for {len(X)} samples")
        self._log_frame(X)

        try:
            features = self.preprocessor.transform(self._feature_columns(X))
            logger.info(f"Preprocessed X shape for prediction: {features.shape}")

            decoded = self.label_encoder.inverse_transform(self.model.predict(features))
            logger.info(f"Made {len(decoded)} predictions")
            return decoded
        except Exception as e:
            logger.error(f"Error in XGBoostURLRegressor predict: {e}")
            return np.array([''] * len(X))

class ImprovedURLRegressor:
    """KNN-based URL predictor that filters invalid URLs and validates matches.

    Rows whose ``Video_URL`` is not an http(s) string are ignored for both
    fitting and prediction. A neighbour's target is accepted only when it is
    filename-consistent with, and fuzzily similar to, the query URL.

    Every column other than ``Video_URL`` is passed straight to the KNN
    model, so those columns presumably need to be numeric (verify against
    the caller).
    """

    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.knn = KNeighborsRegressor(n_neighbors=n_neighbors, metric='cosine')

    def fit(self, X, y):
        """Fit the neighbour index on the rows of *X* that hold a valid URL.

        ``n_neighbors`` is capped at the number of usable rows and the KNN
        model is rebuilt so the capped value takes effect. Returns ``self``.
        """
        logger.info(f"Fitting ImprovedURLRegressor. Input shapes - X: {X.shape}, y: {y.shape}")
        valid_mask = X['Video_URL'].apply(is_valid_url).astype(bool)
        self.X_train = X[valid_mask]
        self.y_train = y[valid_mask]
        logger.info(f"After filtering invalid URLs - X_train: {self.X_train.shape}, y_train: {self.y_train.shape}")

        self.n_neighbors = min(self.n_neighbors, len(self.y_train))
        self.knn = KNeighborsRegressor(n_neighbors=self.n_neighbors, metric='cosine')
        self.knn.fit(self.X_train.drop('Video_URL', axis=1), range(len(self.y_train)))
        return self

    def predict(self, X):
        """Return a Series (indexed like *X*) of predicted URLs.

        Rows with an invalid ``Video_URL`` and rows without an acceptable
        neighbour get ``''``. On any error the failure is logged and a
        Series of empty strings is returned.
        """
        logger.info(f"Predicting with ImprovedURLRegressor. Input shape - X: {X.shape}")
        try:
            valid_mask = X['Video_URL'].apply(is_valid_url).astype(bool)
            X_valid = X[valid_mask]
            logger.info(f"Valid prediction data shape: {X_valid.shape}")

            result = pd.Series([''] * len(X), index=X.index)
            if X_valid.empty:
                # Nothing to look up; skip the neighbour query entirely.
                logger.info("Number of predictions made: 0")
                return result

            distances, indices = self.knn.kneighbors(X_valid.drop('Video_URL', axis=1))
            logger.info(f"KNN prediction indices shape: {indices.shape}")

            predictions = []
            # kneighbors() returns one row per query in positional order, so
            # pair URLs with neighbour rows by position, not by index label.
            for video_url, neighbor_positions in zip(X_valid['Video_URL'], indices):
                for idx in neighbor_positions:
                    predicted_url = self.y_train.iloc[idx]
                    if is_filename_consistent(video_url, predicted_url) and fuzzy_url_match(video_url, predicted_url):
                        predictions.append(predicted_url)
                        break
                else:
                    predictions.append('')

            logger.info(f"Number of predictions made: {len(predictions)}")

            result.loc[valid_mask] = predictions
            return result
        except Exception as e:
            logger.error(f"Error in ImprovedURLRegressor predict: {e}")
            return pd.Series([''] * len(X), index=X.index)

def _promote_validated_targets(known_pairs):
    """Return a copy of *known_pairs* whose ``New_URL`` is the URL to learn from.

    Rows that entered the known set through a validated prediction (``Works``
    is not True) use their ``Predicted_New_URL`` as the training target.
    """
    training = known_pairs.copy()
    if 'Predicted_New_URL' in training.columns:
        promoted = ~training['Works'].eq(True)
        training.loc[promoted, 'New_URL'] = training.loc[promoted, 'Predicted_New_URL']
    return training


def _assemble_results(known_pairs, unknown_pairs):
    """Stack known and unknown pairs and derive the ``Final_URL`` column."""
    all_pairs = pd.concat([known_pairs, unknown_pairs])
    # Guarantee the prediction columns exist even when no prediction step ran.
    for column in ('Predicted_New_URL', 'Prediction_Works'):
        if column not in all_pairs.columns:
            all_pairs[column] = np.nan
    # New_URL for originally working rows, the prediction for everything else.
    all_pairs['Final_URL'] = all_pairs['New_URL'].where(
        all_pairs['Works'].eq(True), all_pairs['Predicted_New_URL'])
    return all_pairs


def train_and_predict_iteratively(csv_path, start_iteration=0):
    """Iteratively infer replacement URLs for rows whose URL pair is unknown.

    The CSV at *csv_path* must provide the columns ``Works``, ``Video_URL``
    and ``New_URL``. Rows with ``Works == True`` form the known set; rows with
    ``Works`` False or missing form the unknown set. The methods ``simple``,
    ``knn`` and ``xgboost`` are applied in that order, starting at index
    *start_iteration*. After each method the predictions are validated and
    the successful rows are moved to the known set. The combined frame is
    then written to ``iterative_url_predictions_iter_<n>.csv``.

    Returns the combined DataFrame (with ``Final_URL``), or ``None`` when an
    error occurred (the error is logged).

    NOTE(review): rows validated in an earlier run (``Prediction_Works`` True
    in a reloaded CSV) are treated as unknown again, because only
    ``Works == True`` counts as known on load. Confirm this is intended.
    """
    try:
        df = pd.read_csv(csv_path)
        logger.info(f"Loaded {len(df)} rows from CSV")

        # Initialize sets for known and unknown pairs
        known_pairs = df[df['Works'] == True].copy()
        unknown_pairs = df[df['Works'].isna() | df['Works'].eq(False)].copy()

        logger.info(f"Initial known pairs: {len(known_pairs)}, unknown pairs: {len(unknown_pairs)}")

        # Defined up front so a run with nothing left to do (no unknown pairs,
        # or start_iteration past the last method) returns data, not None.
        all_pairs = _assemble_results(known_pairs, unknown_pairs)

        # Iterative prediction process
        methods = ['simple', 'knn', 'xgboost']
        for iteration, method in enumerate(methods[start_iteration:], start=start_iteration):
            logger.info(f"Starting iteration {iteration + 1} with method: {method}")

            if len(unknown_pairs) == 0:
                logger.info("All pairs resolved. Stopping iterations.")
                break

            # Prepare data; promoted rows train on their validated prediction
            # rather than on their stale or missing New_URL.
            X_known, y_known = prepare_data(_promote_validated_targets(known_pairs))
            X_unknown, _ = prepare_data(unknown_pairs)

            logger.info(f"Prepared data - X_known: {X_known.shape}, X_unknown: {X_unknown.shape}")
            logger.info(f"X_known dtypes: {X_known.dtypes}")
            logger.info(f"y_known dtype: {y_known.dtype}")

            # Predict using the current method
            if method == 'simple':
                # Only rows that were never checked (Works is missing) get a
                # rule-based guess.
                null_works = unknown_pairs['Works'].isnull()
                predictions = unknown_pairs.loc[null_works, 'Video_URL'].apply(simple_inference)
                unknown_pairs.loc[null_works, 'Predicted_New_URL'] = predictions
            elif method == 'knn':
                model = KNNURLRegressor(n_neighbors=5)
                model.fit(X_known, y_known)
                predictions = model.predict(X_unknown)
                unknown_pairs['Predicted_New_URL'] = predictions
            elif method == 'xgboost':
                model = XGBoostURLRegressor()
                model.fit(X_known.drop('Video_URL', axis=1), y_known)
                predictions = model.predict(X_unknown.drop('Video_URL', axis=1))
                unknown_pairs['Predicted_New_URL'] = predictions

            logger.info(f"Made {len(predictions)} predictions")

            # Validate predictions: the URL must exist, keep the same filename
            # and be textually close to the source URL.
            unknown_pairs['Prediction_Works'] = unknown_pairs.apply(lambda row:
                mp4_url_exists(row['Predicted_New_URL']) and
                is_filename_consistent(row['Video_URL'], row['Predicted_New_URL']) and
                fuzzy_url_match(row['Video_URL'], row['Predicted_New_URL']), axis=1)

            # Move successful predictions to known pairs
            successful_predictions = unknown_pairs[unknown_pairs['Prediction_Works'] == True]
            known_pairs = pd.concat([known_pairs, successful_predictions])
            # Copy so the next iteration assigns into an independent frame.
            unknown_pairs = unknown_pairs[unknown_pairs['Prediction_Works'] == False].copy()

            logger.info(f"Iteration {iteration + 1} complete. "
                        f"Successful predictions: {len(successful_predictions)}, "
                        f"Remaining unknown pairs: {len(unknown_pairs)}")

            # Write results after each iteration
            all_pairs = _assemble_results(known_pairs, unknown_pairs)
            all_pairs.to_csv(f'iterative_url_predictions_iter_{iteration+1}.csv', index=False)
            logger.info(f"Results saved to 'iterative_url_predictions_iter_{iteration+1}.csv'")

        return all_pairs

    except Exception as e:
        # logger.exception keeps the traceback, which the message alone loses.
        logger.exception(f"Error in train_and_predict_iteratively: {e}")
        return None


# Usage
# csv_path = 'db.csv'
# results = train_and_predict_iteratively(csv_path)
# csv_path = 'iterative_url_predictions_iter_1.csv'
csv_path = 'iterative_url_predictions_iter_3.csv'
# start_iteration=2 skips 'simple' and 'knn' and runs only the XGBoost step.
results = train_and_predict_iteratively(csv_path=csv_path, start_iteration=2)

# Summarise the run, or point at the logs when it returned None.
if results is None:
    print("Prediction process failed. Please check the logs for more information.")
else:
    print(f"Total pairs: {len(results)}")
    print(f"Successfully predicted pairs: {results['Prediction_Works'].sum()}")
    print(f"Remaining unknown pairs: {len(results[results['Works'].isin([False, np.nan])])}")

In [None]:
import pandas as pd
import logging

# Configure logging for this cell: INFO level, timestamped messages.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def analyze_xgboost_predictions(csv_path):
    """Log a report of the successful predictions stored in *csv_path*.

    The CSV must provide the columns ``Prediction_Works``, ``Video_URL``,
    ``Predicted_New_URL``, ``New_URL`` and ``Works``. Every row with
    ``Prediction_Works == True`` is logged in detail, followed by the total,
    known (``Works == True``) and unknown (``Works`` False or missing) counts.
    Nothing is returned.
    """
    # Read the CSV file
    df = pd.read_csv(csv_path)
    logger.info(f"Loaded {len(df)} rows from {csv_path}")

    # Filter for successful predictions
    successful_predictions = df[df['Prediction_Works'] == True]
    logger.info(f"Found {len(successful_predictions)} successful predictions")

    if len(successful_predictions) > 0:
        logger.info("Details of successful predictions:")
        for index, row in successful_predictions.iterrows():
            logger.info(f"Index: {index}")
            logger.info(f"Video URL: {row['Video_URL']}")
            logger.info(f"Predicted New URL: {row['Predicted_New_URL']}")
            logger.info(f"Original New URL: {row['New_URL']}")
            logger.info("---")
    else:
        logger.info("No successful predictions found.")

    # Additional analysis
    works = df['Works']
    known_count = int(works.eq(True).sum())
    # Blank cells are loaded by read_csv as NaN. Matching them through
    # isin([..., pd.NA]) may miss them depending on the pandas version, so
    # missing values are tested explicitly.
    unknown_count = int((works.isna() | works.eq(False)).sum())

    logger.info(f"Total pairs: {len(df)}")
    logger.info(f"Known pairs (Works == True): {known_count}")
    logger.info(f"Unknown pairs (Works == False or NaN): {unknown_count}")

# Script entry point: report on the CSV written after iteration 3 (the
# 'xgboost' step) of train_and_predict_iteratively.
if __name__ == "__main__":
    csv_path = 'iterative_url_predictions_iter_3.csv'
    analyze_xgboost_predictions(csv_path)

In [None]:
import pandas as pd

def consolidate_results(original_csv, ml_inferred_csv, output_csv):
    """Fold validated ML predictions back into the original dataset.

    *original_csv* must provide ``Video_URL``, ``New_URL`` and ``Works``;
    *ml_inferred_csv* must provide ``Video_URL``, ``ML_Predicted_New_URL`` and
    ``ML_Prediction_Works``. For every original row with ``Works == False``
    whose ML prediction worked, ``New_URL`` is replaced by the prediction and
    ``Works`` is set to True. The result keeps exactly one row per original
    row, is written to *output_csv*, and summary statistics are printed.
    """
    # Load the original and ML-inferred datasets
    original_df = pd.read_csv(original_csv)
    ml_inferred_df = pd.read_csv(ml_inferred_csv)

    # Keep one ML row per Video_URL, preferring a working prediction, so the
    # left merge below cannot multiply rows of the original dataset.
    ml_columns = ml_inferred_df[['Video_URL', 'ML_Predicted_New_URL', 'ML_Prediction_Works']]
    ml_columns = (
        ml_columns
        .assign(_worked=ml_columns['ML_Prediction_Works'].eq(True))
        .sort_values('_worked', ascending=False, kind='mergesort')  # stable sort
        .drop_duplicates(subset='Video_URL', keep='first')
        .drop(columns='_worked')
    )

    # Merge the datasets based on the Video_URL
    merged_df = pd.merge(original_df, ml_columns, on='Video_URL', how='left')

    # Update the New_URL and Works columns where ML prediction worked
    mask = (merged_df['Works'] == False) & (merged_df['ML_Prediction_Works'] == True)
    merged_df.loc[mask, 'New_URL'] = merged_df.loc[mask, 'ML_Predicted_New_URL']
    merged_df.loc[mask, 'Works'] = True

    # Drop the ML-specific columns
    merged_df = merged_df.drop(columns=['ML_Predicted_New_URL', 'ML_Prediction_Works'])

    # Save the consolidated dataset
    merged_df.to_csv(output_csv, index=False)
    print(f"Consolidated results saved to {output_csv}")

    # Print statistics
    total_updated = int(mask.sum())
    previously_broken = int((original_df['Works'] == False).sum())
    print(f"Total records updated: {total_updated}")
    if previously_broken:
        print(f"Percentage of previously non-working URLs now working: {total_updated / previously_broken:.2%}")
    else:
        # Avoid dividing by zero when every original URL already worked.
        print("Percentage of previously non-working URLs now working: n/a (no non-working URLs in the original data)")

# Usage
# Runs when the cell/module is executed: reads 'db.csv' and
# 'ml_inferred_urls.csv' and writes 'consolidated_db.csv'.
consolidate_results('db.csv', 'ml_inferred_urls.csv', 'consolidated_db.csv')