In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import networkx as nx
from sentence_transformers import SentenceTransformer
import warnings
# Install a blanket "ignore" filter so library warnings do not clutter cell output.
# NOTE(review): this hides every warning category, including deprecations.
warnings.simplefilter('ignore')

print("✅ Libraries imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


✅ Libraries imported successfully!


In [2]:
# Load the training pairs and count how often each question text occurs in them.
# The resulting `q_graph` maps question text -> number of appearances across
# both the `question1` and `question2` columns.
from collections import Counter

print("Loading training data...")
train_df = pd.read_csv('train.csv')

print("Building question graph...")
# Count column by column instead of walking the frame with DataFrame.iterrows():
# iterrows() materialises a Series for every row, which is very slow on a
# training set of this size, while the counts produced are the same.
question_counts = Counter()
for column in ('question1', 'question2'):
    # str() keeps the original coercion: non-string cells (e.g. missing values,
    # which pandas reads as NaN) are counted under their text form.
    question_counts.update(map(str, train_df[column]))

# Expose a plain dict, as before, so downstream lookups see the same type.
q_graph = dict(question_counts)

print(f"✅ Question graph built with {len(q_graph)} unique questions")

Loading training data...
Building question graph...
✅ Question graph built with 537361 unique questions


In [3]:
# Instantiate the sentence-embedding model once and keep it in `sbert_base`
# so later cells can reuse the loaded instance.
print("Loading SBERT model...")
sbert_model_name = 'all-MiniLM-L6-v2'
sbert_base = SentenceTransformer(sbert_model_name)
print("✅ SBERT model loaded successfully!")

Loading SBERT model...
✅ SBERT model loaded successfully!


In [4]:
# Flask Web Application
from flask import Flask, request, jsonify, render_template
import joblib
import re
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean, cityblock
import sys

# Flask application object; HTML templates are resolved from the 'templates' folder.
app = Flask(__name__, template_folder='templates')

# Load the persisted estimators and preprocessing objects from disk.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that come from a trusted source.
print("Loading models...")
lgb_model = joblib.load('models/lightgbm_model.pkl')
xgb_model = joblib.load('models/xgboost_model.pkl')
rf_model = joblib.load('models/random_forest_model.pkl')
gb_model = joblib.load('models/gradient_boosting_model.pkl')
lr_model = joblib.load('models/logistic_regression_model.pkl')
# NOTE(review): the meta-learner is read from 'models_ultra/' while every other
# artifact comes from 'models/' — confirm both directories belong to the same training run.
meta_learner = joblib.load('models_ultra/meta_learner_xgb.pkl')
scaler = joblib.load('models/scaler.pkl')
# NOTE(review): `tfidf` is loaded but not referenced by any code visible in this notebook.
tfidf = joblib.load('models/tfidf_vectorizer.pkl')

print("All models loaded successfully!")
print("Using SBERT model from notebook kernel...")

# NOTE(review): the next two lines prepend a directory located inside a virtual
# environment's site-packages to sys.path and import `extract_feat` from a
# `config_util` module found there. That module is not part of this notebook,
# so its behaviour cannot be reviewed from here, and the relative path only
# resolves when the kernel's working directory contains 'env_proj'.
# Project code should live in the repository and be imported normally — TODO confirm
# what `config_util` is and whether it is needed at all.
sys.path.insert(0, 'env_proj/Lib/site-packages/_distutils_hack')
from config_util import extract_feat

# Helper functions for feature extraction
def extract_leak_features(q1, q2, q_graph):
    """Build the six frequency-based ("leak") features for a question pair.

    Args:
        q1: First question text, used as a lookup key in ``q_graph``.
        q2: Second question text, used as a lookup key in ``q_graph``.
        q_graph: Mapping from question text to its occurrence count.

    Returns:
        list: ``[f1, f2, f1 + f2, f1 - f2, abs(f1 - f2), max(f1, f2)]`` where
        ``f1``/``f2`` are the counts of ``q1``/``q2`` (0 when the question is
        not present in ``q_graph``).
    """
    # Unseen questions contribute a frequency of zero.
    freq_first, freq_second = (
        q_graph[question] if question in q_graph else 0
        for question in (q1, q2)
    )
    signed_gap = freq_first - freq_second

    return [
        freq_first,
        freq_second,
        freq_first + freq_second,
        signed_gap,
        abs(signed_gap),
        max(freq_first, freq_second),
    ]

def extract_lexical_features(q1, q2):
    """Compute nine surface-level similarity features for a question pair.

    Each question is lowercased and reduced to its set of unique word tokens
    (maximal runs of word characters).

    Args:
        q1: First question text.
        q2: Second question text.

    Returns:
        list: ``[word_share, jaccard, len(q1), len(q2), length difference,
        length ratio, unique words in q1, unique words in q2, shared words]``.
        Ratios fall back to 0 when their denominator would be zero.
    """
    token_pattern = r'\w+'
    vocab_q1 = set(re.findall(token_pattern, q1.lower()))
    vocab_q2 = set(re.findall(token_pattern, q2.lower()))

    shared = len(vocab_q1.intersection(vocab_q2))
    union_size = len(vocab_q1.union(vocab_q2))

    # The first two features (word share and Jaccard) are computed with the
    # same formula, so they always carry the same value.
    # NOTE(review): kept as-is because the feature layout must match whatever
    # the scaler/models were fitted on — confirm against the training code.
    overlap_ratio = shared / union_size if union_size > 0 else 0

    # Character-level lengths of the raw (non-lowercased) strings.
    chars_q1, chars_q2 = len(q1), len(q2)
    shorter, longer = sorted((chars_q1, chars_q2))
    length_ratio = shorter / longer if longer > 0 else 0

    return [
        overlap_ratio,
        overlap_ratio,
        chars_q1,
        chars_q2,
        abs(chars_q1 - chars_q2),
        length_ratio,
        len(vocab_q1),
        len(vocab_q2),
        shared,
    ]

def get_sbert_features(q1, q2):
    """Embed both questions with the kernel-level ``sbert_base`` model.

    Args:
        q1: First question text.
        q2: Second question text.

    Returns:
        list | None: Both embeddings concatenated, followed by their cosine
        similarity, Euclidean distance and Manhattan distance (771 values when
        the encoder emits 384-dimensional vectors). ``None`` if anything fails;
        the error is printed rather than raised so the caller can respond
        gracefully.
    """
    try:
        # Each question is encoded in its own call, as a one-element batch.
        vec_q1 = sbert_base.encode([q1])[0]
        vec_q2 = sbert_base.encode([q2])[0]

        # Three pairwise similarity/distance measures between the embeddings.
        pair_metrics = [
            cosine_similarity([vec_q1], [vec_q2])[0][0],
            euclidean(vec_q1, vec_q2),
            cityblock(vec_q1, vec_q2),
        ]

        return [*vec_q1, *vec_q2, *pair_metrics]
    except Exception as e:
        print(f"Error extracting SBERT features: {e}")
        return None

@app.route('/')
def home():
    """Serve the landing page by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

def _parse_question_pair(payload):
    """Return the stripped ``(question1, question2)`` pair, or ``None`` if the payload is unusable.

    A payload is unusable when it is not a JSON object, when either question is
    not a string, or when either question is empty after stripping whitespace.
    """
    if not isinstance(payload, dict):
        return None

    first = payload.get('question1', '')
    second = payload.get('question2', '')
    if not isinstance(first, str) or not isinstance(second, str):
        return None

    first, second = first.strip(), second.strip()
    if not first or not second:
        return None
    return first, second


@app.route('/predict', methods=['POST'])
def predict():
    """Score one question pair with the stacked ensemble and return the verdict as JSON.

    Expects a JSON object with string fields ``question1`` and ``question2``.

    Returns:
        * 200 with ``{'duplicate': bool, 'confidence': float, 'message': str}``.
        * 400 with ``{'error': ...}`` when the body is not a JSON object or a
          question is missing, empty, or not a string.
        * 503 with ``{'error': ...}`` when SBERT features cannot be computed.
        * 500 with ``{'error': ...}`` on any other failure; the traceback is
          printed server-side and not sent to the client.
    """
    try:
        # silent=True yields None instead of raising on a malformed or non-JSON
        # body, so bad requests are answered with 400 rather than ending up in
        # the generic 500 handler below.
        questions = _parse_question_pair(request.get_json(silent=True))
        if questions is None:
            return jsonify({'error': 'Both questions are required'}), 400
        question1, question2 = questions

        # 1. Leak features (6)
        leak_feats = extract_leak_features(question1, question2, q_graph)

        # 2. Lexical features (9)
        lexical_feats = extract_lexical_features(question1, question2)

        # 3. SBERT features (771)
        sbert_feats = get_sbert_features(question1, question2)
        if sbert_feats is None:
            return jsonify({'error': 'SBERT model unavailable'}), 503

        # Combine all features (786 total) into a single-row matrix and scale it
        combined_features = leak_feats + lexical_feats + sbert_feats
        features_scaled = scaler.transform(np.array([combined_features]))

        # Base model outputs.
        # NOTE(review): lgb_model and xgb_model are queried with .predict() while
        # the other three use .predict_proba(); whether .predict() returns a
        # probability depends on how those two models were saved — confirm this
        # matches the inputs the meta-learner was trained on.
        lgb_pred = lgb_model.predict(features_scaled)[0]
        xgb_pred = xgb_model.predict(features_scaled)[0]
        rf_pred = rf_model.predict_proba(features_scaled)[0, 1]
        gb_pred = gb_model.predict_proba(features_scaled)[0, 1]
        lr_pred = lr_model.predict_proba(features_scaled)[0, 1]

        # Stack the base outputs as the meta-learner's single input row
        stacked = np.array([[lgb_pred, xgb_pred, rf_pred, gb_pred, lr_pred]])
        final_prob = meta_learner.predict(stacked)[0]

        # Clip to [0, 1] so the value can be reported as a probability
        final_prob = np.clip(final_prob, 0.0, 1.0)

        # Confidence is the probability of whichever class was chosen
        is_duplicate = bool(final_prob > 0.5)
        confidence = float(final_prob if is_duplicate else 1 - final_prob)

        # The response is always derived from the ensemble computed above; it is
        # never replaced by the output of external helper code.
        return jsonify({
            'duplicate': is_duplicate,
            'confidence': confidence,
            'message': f"Questions are {'duplicates' if is_duplicate else 'not duplicates'}"
        })

    except Exception as e:
        # Keep the details in the server log only; the client gets a generic
        # message so internal paths and library errors are not disclosed.
        print(f"Prediction error: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': 'Internal server error'}), 500

# Announce that the routes are registered and the app object is ready to serve.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("Flask app initialized! Ready to run.")
print(banner_rule)

Loading models...
All models loaded successfully!
Using SBERT model from notebook kernel...

Flask app initialized! Ready to run.


In [None]:
# Start the Flask development server (this call blocks the cell until interrupted)
if __name__ == '__main__':
    startup_lines = (
        "\n" + "=" * 30,
        "Starting Flask Web Server...",
        "* Access the web app at: http://localhost:5000",
        "= " * 30 + "\n",
    )
    for line in startup_lines:
        print(line)

    # The debugger and the auto-reloader are both switched off for running
    # inside the notebook; no threading option is passed explicitly.
    # NOTE(review): host='0.0.0.0' listens on every network interface, so the
    # server is reachable from other machines as well as via the localhost URL
    # printed above — confirm that exposure is intended.
    app.run(host='0.0.0.0', port=5000, debug=False, use_reloader=False)


Starting Flask Web Server...
* Access the web app at: http://localhost:5000
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://10.15.13.184:5000
Press CTRL+C to quit
127.0.0.1 - - [30/Nov/2025 10:56:24] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [30/Nov/2025 10:56:40] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [30/Nov/2025 10:56:46] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [30/Nov/2025 10:57:34] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [30/Nov/2025 10:58:03] "POST /predict HTTP/1.1" 200 -
