<a href="https://colab.research.google.com/github/chinna-03/slash-mark-basic-task2/blob/main/fake_news_det.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so the CSV
# files under /content/drive/MyDrive can be read by the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the train / test / validation splits from the mounted Drive folder.
# NOTE(review): main() re-reads these same three files into its own local
# variables, so the module-level frames created here are not used by the
# pipeline itself; they are only available for ad-hoc inspection.
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')
val_data = pd.read_csv('/content/drive/MyDrive/valid.csv')

In [4]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

# Step 1: Data Preparation
def load_data(file_path):
    """Read a CSV source into a pandas DataFrame.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV to load.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(file_path)
    return frame

def preprocess_data(df):
    """Normalize column names and convert labels to binary integers.

    Renames ``Statement`` -> ``text`` and ``Label`` -> ``label``, maps each
    label to 1 for "true" and 0 for "false" (case-insensitive, surrounding
    whitespace ignored), and drops rows whose text is missing or whose label
    is anything other than true/false.

    Args:
        df: DataFrame with ``Statement``/``Label`` columns (or already-renamed
            ``text``/``label`` columns). The input frame is not modified.

    Returns:
        A new DataFrame with a ``text`` column and an int64 ``label`` column.

    Raises:
        KeyError: If the frame has no ``label`` or ``text`` column after the
            rename.
    """
    # rename() returns a new frame, so the assignment below never touches
    # the caller's DataFrame.
    df = df.rename(columns={'Statement': 'text', 'Label': 'label'})

    # astype(str) lets boolean labels (True/False) and string labels
    # ('TRUE', ' false ') go through the same normalization path.
    normalized = df['label'].astype(str).str.strip().str.lower()
    df['label'] = normalized.map({'true': 1, 'false': 0})

    # Unmapped labels became NaN in the step above; drop them together with
    # rows that have no text.
    df = df.dropna(subset=['text', 'label'])

    # map() yields float64 whenever any label was unmapped, which would make
    # the label dtype differ between splits. Cast once NaNs are gone so every
    # split carries integer class labels.
    return df.astype({'label': 'int64'})

# Step 2: Feature Extraction
def extract_features(train_df, test_df, valid_df):
    """Turn the ``text`` column of each split into TF-IDF features.

    The vectorizer is fitted on the training text only; the test and
    validation text are transformed with that already-fitted vectorizer.

    Args:
        train_df: Training frame with a ``text`` column.
        test_df: Test frame with a ``text`` column.
        valid_df: Validation frame with a ``text`` column.

    Returns:
        A 4-tuple ``(X_train, X_test, X_valid, vectorizer)`` holding the three
        feature matrices and the fitted ``TfidfVectorizer``.
    """
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    train_matrix = tfidf.fit_transform(train_df['text'])
    # Test first, then validation, reusing the vocabulary learned above.
    test_matrix, valid_matrix = (
        tfidf.transform(frame['text']) for frame in (test_df, valid_df)
    )
    return train_matrix, test_matrix, valid_matrix, tfidf

# Step 3: Model Training
def train_model(X_train, y_train):
    """Fit a logistic-regression classifier on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with the rows of ``X_train``.

    Returns:
        The fitted ``LogisticRegression`` estimator (``max_iter=1000``).
    """
    classifier = LogisticRegression(max_iter=1000)
    # fit() returns the estimator itself, so it can be returned directly.
    return classifier.fit(X_train, y_train)

# Step 4: Save Artifacts
def save_artifacts(vectorizer, X_train, X_test, X_valid, y_train, y_test, y_valid, model):
    """Pickle the vectorizer, feature matrices, labels and model to disk.

    All files are written to the current working directory:
    ``vectorizer.pkl``, ``X_train.pkl``, ``X_test.pkl``, ``X_valid.pkl``,
    ``y_train.pkl``, ``y_test.pkl``, ``y_valid.pkl`` and ``final_model.sav``.

    Args:
        vectorizer: Fitted vectorizer, written with ``pickle.dump``.
        X_train, X_test, X_valid: Feature matrices, written with
            ``pd.to_pickle``.
        y_train, y_test, y_valid: Label objects exposing ``to_pickle``
            (e.g. pandas Series).
        model: Trained model, written with ``pickle.dump``.
    """
    with open('vectorizer.pkl', 'wb') as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    feature_files = (
        ('X_train.pkl', X_train),
        ('X_test.pkl', X_test),
        ('X_valid.pkl', X_valid),
    )
    for filename, matrix in feature_files:
        pd.to_pickle(matrix, filename)

    label_files = (
        ('y_train.pkl', y_train),
        ('y_test.pkl', y_test),
        ('y_valid.pkl', y_valid),
    )
    for filename, labels in label_files:
        labels.to_pickle(filename)

    with open('final_model.sav', 'wb') as f:
        pickle.dump(model, f)

# Step 5: Evaluation
def evaluate_model(model, X_test, y_test):
    """Print and return the model's accuracy on a held-out set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels aligned with the rows of ``X_test``.

    Returns:
        The accuracy computed by ``accuracy_score(y_test, predictions)``.
        It is also printed as ``Accuracy: <value>``.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')
    # Return the metric as well so callers can log or compare it instead of
    # having to parse stdout.
    return accuracy

# Step 6: Making Predictions
def load_model_and_predict(model_path, vectorizer, new_data):
    """Load a pickled model from disk and predict on new raw inputs.

    Args:
        model_path: Path of the pickled model file.
        vectorizer: Fitted vectorizer whose ``transform`` converts
            ``new_data`` into model features.
        new_data: Raw inputs to vectorize and classify.

    Returns:
        Whatever the loaded model's ``predict`` returns for the transformed
        data.

    Note:
        ``pickle.load`` can execute arbitrary code contained in the file;
        only load model files from a trusted source.
    """
    with open(model_path, 'rb') as fh:
        loaded_model = pickle.load(fh)
    features = vectorizer.transform(new_data)
    return loaded_model.predict(features)

# Main function to run the entire pipeline
def main(data_dir='/content/drive/MyDrive'):
    """Run the whole pipeline: load, preprocess, vectorize, train, save, evaluate.

    Args:
        data_dir: Directory containing ``train.csv``, ``test.csv`` and
            ``valid.csv``. Defaults to the Google Drive folder this notebook
            reads from, so calling ``main()`` with no arguments reads the
            same files as before.

    Side effects:
        Writes the pickled artifacts produced by ``save_artifacts`` to the
        current working directory and prints the test accuracy plus the first
        ten validation predictions next to the actual labels.
    """
    # Step 1: Load and preprocess data (via the shared load_data helper)
    train_df = preprocess_data(load_data(f'{data_dir}/train.csv'))
    test_df = preprocess_data(load_data(f'{data_dir}/test.csv'))
    valid_df = preprocess_data(load_data(f'{data_dir}/valid.csv'))

    # Step 2: Feature Extraction
    X_train, X_test, X_valid, vectorizer = extract_features(train_df, test_df, valid_df)
    y_train = train_df['label']
    y_test = test_df['label']
    y_valid = valid_df['label']

    # Step 3: Model Training
    model = train_model(X_train, y_train)

    # Step 4: Save artifacts
    save_artifacts(vectorizer, X_train, X_test, X_valid, y_train, y_test, y_valid, model)

    # Step 5: Evaluation
    evaluate_model(model, X_test, y_test)

    # Step 6: Making Predictions
    # Reload the model just saved to check the persisted artifact end to end.
    predictions = load_model_and_predict('final_model.sav', vectorizer, valid_df['text'])
    print(f'Predictions: {predictions[:10]}')
    print(f'Actual: {y_valid.head(10).values}')

# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Accuracy: 0.6029008232065857
Predictions: [1 0 0 1 0 1 1 1 1 0]
Actual: [0. 0. 0. 1. 1. 0. 1. 0. 1. 1.]
