In [None]:
import pandas as pd
import json

def fix_json(input_file, output_file):
    """Repair a dump of brace-delimited records and save it as a JSON array.

    The text is split into records on ``'}\\n'``. Each record is parsed as
    strict JSON first; if that fails it is parsed as a Python literal, which
    copes with single-quoted strings and with values that contain an
    apostrophe; as a last resort the blanket single-to-double quote
    replacement is tried. Records that still cannot be parsed are printed
    and skipped.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the JSON file to write (``orient='records'``).
    """
    import ast  # local import so the function does not depend on the notebook's import lines

    def _parse_record(text):
        # Parse one record; raises json.JSONDecodeError if every strategy fails.
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            pass
        # Legacy behaviour: only safe when no value contains a quote character
        return json.loads(text.replace("'", "\""))

    # Read the JSON file as text
    with open(input_file, 'r') as f:
        json_text = f.read()

    # Split the JSON text into individual objects
    json_objects = json_text.split('}\n')
    last_position = len(json_objects) - 1

    data = []
    for position, obj in enumerate(json_objects):
        if not obj.strip():
            continue

        # split() removed the closing brace from every piece except the final
        # one, which is whatever followed the last '}\n'. The final piece is
        # therefore tried as-is first, then with a brace appended.
        if position < last_position:
            candidates = [obj + '}']
        else:
            candidates = [obj, obj + '}']

        failure = None
        for candidate in candidates:
            try:
                data.append(_parse_record(candidate))
            except json.JSONDecodeError as e:
                failure = e
            else:
                failure = None
                break

        if failure is not None:
            print("Error decoding JSON object:", failure)
            print("JSON object causing the error:", obj)

    # Convert the list of JSON objects into a DataFrame
    df = pd.DataFrame(data)

    # Write the DataFrame back to a JSON file with corrected formatting
    df.to_json(output_file, orient='records')

    print("JSON file has been fixed and saved to", output_file)

# Raw metadata dump to read, and where the repaired JSON array is written
input_file, output_file = "meta_Automotive.json", "fix_meta_Automotive.json"

fix_json(input_file=input_file, output_file=output_file)


In [None]:
import json

# Function to format JSON objects individually
def format_json_objects(input_file, output_file):
    """Rewrite a JSON file as one serialised item per line.

    Args:
        input_file: Path of a JSON file; its top-level value is iterated.
        output_file: Path to write; every item is dumped followed by a newline.
    """
    with open(input_file, 'r') as source:
        records = json.load(source)

    with open(output_file, 'w') as sink:
        # Generator keeps the write streaming, one record at a time
        sink.writelines(json.dumps(record) + '\n' for record in records)

# Turn the repaired JSON array into a line-delimited file
input_file, output_file = 'fix_meta_Automotive.json', 'formatted_meta_Automotive.json'

format_json_objects(input_file=input_file, output_file=output_file)


In [None]:
import pandas as pd

# Read the ratings CSV and label its four columns.
# NOTE(review): read_csv uses the first line as the header by default. If this
# file has no header row, that first rating becomes the column labels and is
# lost when they are overwritten below - confirm, and pass header=None if so.
ratings_path = 'ratings_Automotive.csv'
ratings_data = pd.read_csv(ratings_path)
ratings_data.columns = 'user_id item_id rating timestamp'.split()
ratings_data.head()


In [None]:
def load_large_json(file_path):
    """Load a line-delimited JSON file (one document per line) into a DataFrame.

    The file is read line by line so the raw text is never held in memory as a
    whole. Blank or whitespace-only lines (for example a trailing empty line)
    are skipped instead of aborting the load.

    Args:
        file_path: Path of the line-delimited JSON file.

    Returns:
        A DataFrame built from the parsed documents, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.strip():
                data.append(json.loads(line))
    return pd.DataFrame(data)

# Parse the line-delimited metadata file into a DataFrame
metadata_df = load_large_json('formatted_meta_Automotive.json')

# Left-join the metadata onto every rating (item_id matched against asin),
# then discard the join key contributed by the metadata side
full_data = ratings_data.merge(
    metadata_df,
    how='left',
    left_on='item_id',
    right_on='asin',
).drop(columns='asin')

# Preview the combined frame
full_data.head()

In [None]:
# Null count per column, and the same count as a percentage of all rows
missing_values = full_data.isna().sum()
missing_percentage = missing_values.div(len(full_data)).mul(100)
missing_data = missing_values.to_frame('Missing Values').assign(Percentage=missing_percentage)
print("\nMissing Values:\n", missing_data)

In [None]:
# Summary statistics of the numeric columns
print(full_data.describe())

# Null count per column
print(full_data.isnull().sum())

# Plotting libraries used by the charts below
import matplotlib.pyplot as plt
import seaborn as sns

# Price: 50-bin histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(full_data['price'], bins=50, kde=True, ax=ax)
ax.set_title('Price Distribution')
plt.show()

# Rating: 5-bin histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(full_data['rating'], bins=5, kde=True, ax=ax)
ax.set_title('Rating Distribution')
plt.show()

# Rating: box plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=full_data, x='rating', ax=ax)
ax.set_title('Rating Box Plot')
plt.show()

# Pairwise correlation of the float64 / int64 columns only
numeric_cols = full_data.select_dtypes(include=['float64', 'int64']).columns
corr_matrix = full_data[numeric_cols].corr()

# Annotated heatmap of that correlation matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()



In [None]:
# Pre-process the data: fill the gaps column by column, then drop salesRank.
#
# Each column is reassigned instead of calling fillna(..., inplace=True) on
# full_data[col]: that form mutates an intermediate object selected from the
# frame, which pandas warns about and which leaves full_data unchanged once
# Copy-on-Write is active.
fill_values = {
    'price': full_data['price'].mean(),  # numeric gap -> column mean
    'brand': 'Unknown',
    'description': 'No description',
    'title': 'No title',
    'imUrl': 'No image',
    'related': 'No related items',
}
for column, fill_value in fill_values.items():
    full_data[column] = full_data[column].fillna(fill_value)

# Drop the salesRank column due to high percentage of missing values.
# errors='ignore' lets the cell be re-run after the column is already gone.
full_data = full_data.drop(columns=['salesRank'], errors='ignore')

# Convert list of categories to a single string
def convert_categories(cat):
    """Collapse a categories value into a single space-separated string.

    Args:
        cat: A list of labels (possibly containing nested lists of labels),
            a missing value, or any other value.

    Returns:
        For a list: its labels joined by single spaces, with nested lists
        flattened in order. For a missing value: ``''``. Anything else is
        returned unchanged.
    """
    def _flatten(values):
        # Expand real lists only. A bare string is a label and must not be
        # iterated, otherwise it would be split into single characters.
        for value in values:
            if isinstance(value, list):
                yield from _flatten(value)
            else:
                yield value

    if isinstance(cat, list):
        return ' '.join(_flatten(cat))
    elif pd.isna(cat):
        return ''
    else:
        return cat

# Collapse every categories entry into one string
full_data['categories'] = full_data['categories'].apply(convert_categories)

# Replace the categories and brand labels with their integer category codes
for label_column in ('categories', 'brand'):
    full_data[label_column] = full_data[label_column].astype('category').cat.codes

# Persist the pre-processed frame (without the index) for the later cells
full_data.to_csv('preprocessed_combined_data.csv', index=False)


In [None]:
# Null count per column, and the same count as a percentage of all rows
missing_values = full_data.isna().sum()
missing_percentage = missing_values.div(len(full_data)).mul(100)
missing_data = missing_values.to_frame('Missing Values').assign(Percentage=missing_percentage)
print("\nMissing Values:\n", missing_data)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Load pre-processed data
combined_df = pd.read_csv('preprocessed_combined_data.csv')

# Fill missing text with a placeholder. The result is assigned back because
# fillna(..., inplace=True) on a selected column mutates an intermediate
# object and leaves combined_df unchanged under pandas Copy-on-Write, which
# would pass NaN on to the vectorizers below.
combined_df['title'] = combined_df['title'].fillna('No Title')
combined_df['description'] = combined_df['description'].fillna('No Description')

# Select a subset of the data for testing
combined_df = combined_df.sample(frac=0.1, random_state=42)

# Apply TF-IDF Vectorization separately
tfidf_vectorizer_title = TfidfVectorizer(stop_words='english', max_features=2500)  # Half the features for title
tfidf_vectorizer_description = TfidfVectorizer(stop_words='english', max_features=2500)  # Half for description

# NOTE(review): the vectorizers and the SVD below are fitted on every sampled
# row before the train/test split, so test-set text influences the features.
# Consider fitting them on the training rows only.
title_tfidf_matrix = tfidf_vectorizer_title.fit_transform(combined_df['title'])
description_tfidf_matrix = tfidf_vectorizer_description.fit_transform(combined_df['description'])

# Dimensionality reduction; seeded so repeated runs give the same components
svd_title = TruncatedSVD(n_components=50, random_state=42)  # Reduce dimensions for title
svd_description = TruncatedSVD(n_components=50, random_state=42)  # Reduce for description

title_tfidf_reduced = svd_title.fit_transform(title_tfidf_matrix)
description_tfidf_reduced = svd_description.fit_transform(description_tfidf_matrix)

# Combine TF-IDF with Rating Data (indexes are aligned to combined_df so the
# concat below matches rows correctly after sampling)
title_df = pd.DataFrame(title_tfidf_reduced, index=combined_df.index)
description_df = pd.DataFrame(description_tfidf_reduced, index=combined_df.index)
full_feature_df = pd.concat([title_df, description_df], axis=1)

# NOTE: this rebinds full_data, replacing the merged ratings/metadata frame
# that earlier cells stored under the same name.
full_data = pd.concat([combined_df[['user_id', 'item_id', 'rating']], full_feature_df], axis=1)

# Split the data into training and testing sets
train_data, test_data = train_test_split(full_data, test_size=0.2, random_state=42)

# Example model: Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(train_data.drop(['user_id', 'item_id', 'rating'], axis=1), train_data['rating'])

# Predict ratings
predicted_ratings = model.predict(test_data.drop(['user_id', 'item_id', 'rating'], axis=1))

# Evaluation. RMSE is taken as the square root of the MSE rather than via
# squared=False, which newer scikit-learn releases no longer accept.
rmse = mean_squared_error(test_data['rating'], predicted_ratings) ** 0.5
mae = mean_absolute_error(test_data['rating'], predicted_ratings)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')


In [None]:
import pandas as pd
# Define a function to load data in chunks
def load_data_in_chunks(file_path, chunk_size=10000):
    """Read a CSV in pieces of ``chunk_size`` rows and return one DataFrame.

    Args:
        file_path: Path of the CSV file.
        chunk_size: Number of rows read per piece.

    Returns:
        All pieces concatenated row-wise, in file order.
    """
    reader = pd.read_csv(file_path, chunksize=chunk_size)
    pieces = [piece for piece in reader]
    return pd.concat(pieces, axis=0)

# Read the pre-processed CSV back, 10000 rows at a time
file_path = 'preprocessed_combined_data.csv'
combined_df = load_data_in_chunks(file_path=file_path, chunk_size=10000)

# Keep a reproducible random 10% of the rows (sample() also shuffles them)
combined_df_subset = combined_df.sample(random_state=42, frac=0.1)



In [None]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Item x user rating table: one row per item_id, one column per user_id,
# with 0 wherever a user has no rating for an item
item_features = (
    combined_df_subset
    .pivot_table(index='item_id', columns='user_id', values='rating')
    .fillna(0)
)

# Sparse copy of the table for the similarity computation
item_features_sparse = csr_matrix(item_features.values)

# Cosine similarity between every pair of item rows
item_similarity = cosine_similarity(item_features_sparse)

# Label both axes of the similarity matrix with the item ids
item_ids = item_features.index
item_similarity_df = pd.DataFrame(item_similarity, index=item_ids, columns=item_ids)


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Function to predict ratings using item similarity
def predict_ratings_batch(user_id, item_id):
    """Predict one user's rating of one item from item-item similarity.

    Reads the module-level ``item_features`` (item x user rating table) and
    ``item_similarity_df`` (item x item similarity table, built with the same
    item index). The prediction is the similarity-weighted average of the
    ratings the user actually gave. Cells equal to 0 are the fill value for
    "no rating", so they are left out of both the numerator and the
    denominator; counting their similarity in the denominator would drag
    every prediction towards 0.

    Args:
        user_id: Column label in ``item_features``.
        item_id: Row label in ``item_features`` / ``item_similarity_df``.

    Returns:
        The predicted rating, or ``np.nan`` when the item or the user is not
        in ``item_features`` or the similarities to the user's rated items do
        not sum to a positive value.
    """
    if item_id not in item_features.index or user_id not in item_features.columns:
        return np.nan

    # Both vectors follow item_features.index, so positional masking is aligned
    similar_items = item_similarity_df.loc[item_id].to_numpy()
    user_ratings = item_features.loc[:, user_id].to_numpy()

    rated = user_ratings != 0
    weights = similar_items[rated]

    sum_of_weights = np.sum(weights)
    if sum_of_weights > 0:
        return np.dot(weights, user_ratings[rated]) / sum_of_weights
    return np.nan

# Predict ratings for the test subset in batches
batch_size = 1000
# NOTE(review): the test rows are drawn from combined_df_subset, the same rows
# item_features was built from, so each prediction can see the rating it is
# scored against. Hold the test rows out before building item_features for an
# unbiased estimate.
test_df_subset = combined_df_subset.sample(frac=0.2, random_state=42)

predicted_ratings = []
for start in range(0, len(test_df_subset), batch_size):
    end = start + batch_size
    batch = test_df_subset.iloc[start:end]
    batch_predictions = batch.apply(lambda row: predict_ratings_batch(row['user_id'], row['item_id']), axis=1)
    predicted_ratings.extend(batch_predictions)

test_df_subset['predicted_rating'] = predicted_ratings

# predict_ratings_batch returns NaN when it cannot score a pair, and the
# sklearn metrics reject NaN input, so only scored rows are evaluated.
scored_rows = test_df_subset.dropna(subset=['rating', 'predicted_rating'])

# Evaluation. The printed label says "Content-Based", although the similarity
# above is computed from user ratings. RMSE is the square root of the MSE
# rather than squared=False, which newer scikit-learn no longer accepts.
if scored_rows.empty:
    cb_rmse = cb_mae = np.nan
else:
    cb_rmse = mean_squared_error(scored_rows['rating'], scored_rows['predicted_rating']) ** 0.5
    cb_mae = mean_absolute_error(scored_rows['rating'], scored_rows['predicted_rating'])

print(f'Content-Based Filtering RMSE: {cb_rmse}')
print(f'Content-Based Filtering MAE: {cb_mae}')


In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define a function to load data in chunks
def load_data_in_chunks(file_path, chunk_size=10000):
    """Read a CSV in pieces of ``chunk_size`` rows and return one DataFrame.

    Args:
        file_path: Path of the CSV file.
        chunk_size: Number of rows read per piece.

    Returns:
        All pieces concatenated row-wise, in file order.
    """
    reader = pd.read_csv(file_path, chunksize=chunk_size)
    pieces = [piece for piece in reader]
    return pd.concat(pieces, axis=0)

# Read the pre-processed CSV back, 10000 rows at a time
file_path = 'preprocessed_combined_data.csv'
combined_df = load_data_in_chunks(file_path=file_path, chunk_size=10000)

# frac=1 keeps every row and only shuffles their order
combined_df_subset = combined_df.sample(random_state=42, frac=1)

# Item x user rating table: one row per item_id, one column per user_id,
# with 0 wherever a user has no rating for an item.
# NOTE(review): this is a dense table over every item and user in the file -
# confirm it fits in memory at full size.
item_features = (
    combined_df_subset
    .pivot_table(index='item_id', columns='user_id', values='rating')
    .fillna(0)
)

# Sparse copy of the table for the similarity computation
item_features_sparse = csr_matrix(item_features.values)

# Cosine similarity between every pair of item rows
item_similarity = cosine_similarity(item_features_sparse)

# Label both axes of the similarity matrix with the item ids
item_ids = item_features.index
item_similarity_df = pd.DataFrame(item_similarity, index=item_ids, columns=item_ids)

# Function to predict ratings using item similarity
def predict_ratings_batch(user_id, item_id):
    """Predict one user's rating of one item from item-item similarity.

    Reads the module-level ``item_features`` (item x user rating table) and
    ``item_similarity_df`` (item x item similarity table, built with the same
    item index). The prediction is the similarity-weighted average of the
    ratings the user actually gave. Cells equal to 0 are the fill value for
    "no rating", so they are left out of both the numerator and the
    denominator; counting their similarity in the denominator would drag
    every prediction towards 0.

    Args:
        user_id: Column label in ``item_features``.
        item_id: Row label in ``item_features`` / ``item_similarity_df``.

    Returns:
        The predicted rating, or ``np.nan`` when the item or the user is not
        in ``item_features`` or the similarities to the user's rated items do
        not sum to a positive value.
    """
    if item_id not in item_features.index or user_id not in item_features.columns:
        return np.nan

    # Both vectors follow item_features.index, so positional masking is aligned
    similar_items = item_similarity_df.loc[item_id].to_numpy()
    user_ratings = item_features.loc[:, user_id].to_numpy()

    rated = user_ratings != 0
    weights = similar_items[rated]

    sum_of_weights = np.sum(weights)
    if sum_of_weights > 0:
        return np.dot(weights, user_ratings[rated]) / sum_of_weights
    return np.nan

# Predict ratings for the test subset in batches
batch_size = 1000
# NOTE(review): the test rows are drawn from combined_df_subset, the same rows
# item_features was built from, so each prediction can see the rating it is
# scored against. Hold the test rows out before building item_features for an
# unbiased estimate.
test_df_subset = combined_df_subset.sample(frac=0.1, random_state=42)  # Use 10% of the full dataset for testing

predicted_ratings = []
for start in range(0, len(test_df_subset), batch_size):
    end = start + batch_size
    batch = test_df_subset.iloc[start:end]
    batch_predictions = batch.apply(lambda row: predict_ratings_batch(row['user_id'], row['item_id']), axis=1)
    predicted_ratings.extend(batch_predictions)

test_df_subset['predicted_rating'] = predicted_ratings

# predict_ratings_batch returns NaN when it cannot score a pair, and the
# sklearn metrics reject NaN input, so only scored rows are evaluated.
scored_rows = test_df_subset.dropna(subset=['rating', 'predicted_rating'])

# Evaluation. The printed label says "Content-Based", although the similarity
# above is computed from user ratings. RMSE is the square root of the MSE
# rather than squared=False, which newer scikit-learn no longer accepts.
if scored_rows.empty:
    cb_rmse = cb_mae = np.nan
else:
    cb_rmse = mean_squared_error(scored_rows['rating'], scored_rows['predicted_rating']) ** 0.5
    cb_mae = mean_absolute_error(scored_rows['rating'], scored_rows['predicted_rating'])

print(f'Content-Based Filtering RMSE: {cb_rmse}')
print(f'Content-Based Filtering MAE: {cb_mae}')
