In [67]:
import pandas as pd


file_path = r"C:\Users\User\Downloads\customers_data.csv"  
data = pd.read_csv(file_path)


print("Columns before cleaning:", data.columns)
data.columns = data.columns.str.strip().str.lower().str.replace(' ', '_')
print("Columns after cleaning:", data.columns)
data = data.drop_duplicates()

if 'company_profit' in data.columns:
    data['company_profit'] = data['company_profit'].fillna(data['company_profit'].mean())
else:
    print("'company_profit' column not found in the dataset.")

if 'company_name' in data.columns:
    data['company_name'] = data['company_name'].fillna('Unknown')

if 'company_name' in data.columns:
    data['company_name'] = data['company_name'].str.replace(r'[^\w\s]', '', regex=True)  
    data['company_name'] = data['company_name'].str.strip() 


if 'address' in data.columns:
    data['address'] = data['address'].str.strip().str.title()

if 'company_profit' in data.columns:
    data['company_profit'] = pd.to_numeric(data['company_profit'], errors='coerce')  

data = data.dropna()

data.to_csv('cleaned_file.csv', index=False)

print(data.head())

Columns before cleaning: Index(['Company_ID', 'Company_Name', 'Company_Profit', 'Address'], dtype='object')
Columns after cleaning: Index(['company_id', 'company_name', 'company_profit', 'address'], dtype='object')
   company_id          company_name  company_profit  \
0         1.0  Tech  Enterprises  1         80701.0   
1         2.0   Global  Partners  2         80511.0   
2         3.0  Quantum Associates 3        110664.0   
3         4.0       Prime Network 4         76400.5   
4         5.0    Elite  Ventures  5         69427.0   

                                             address  
0             Edsa, Barangay 606, Pasig, Philippines  
1  Commonwealth Ave, Barangay 789, Taguig, Philip...  
2       Roxas Blvd, Barangay 505, Pasig, Philippines  
3  Alabang-Zapote Rd, Barangay 202, Taguig, Phili...  
4    Ayala Avenue, Barangay 101, Makati, Philippines  


In [47]:
import pandas as pd


file_path = r"C:\path\to\your\products_data.csv"  
data = pd.read_csv(file_path)

print("Columns before cleaning:", data.columns)

data.columns = data.columns.str.strip().str.lower().str.replace(' ', '_')

print("Columns after cleaning:", data.columns)

data = data.drop_duplicates()

data['product_name'] = data['product_name'].fillna('Unknown')

data['product_price'] = data['product_price'].replace(r'[^\d.]', '', regex=True).astype(float)

print(data['product_price'].head())

data = data.dropna(subset=['product_price'])

data.to_csv('cleaned_products_data.csv', index=False)

print(data.head())


Columns before cleaning: Index(['Product_ID', 'Product_Name', 'Product_Price'], dtype='object')
Columns after cleaning: Index(['product_id', 'product_name', 'product_price'], dtype='object')
0    140000.0
1    168000.0
2    100800.0
3    123200.0
4     84000.0
Name: product_price, dtype: float64
   product_id            product_name  product_price
0         1.0      FinPredictor Suite       140000.0
1         2.0  MarketMinder Analytics       168000.0
2         3.0    TrendWise Forecaster       100800.0
3         4.0  CustomerScope Insights       123200.0
4         5.0     SalesSync Optimizer        84000.0


In [51]:
import pandas as pd

file_path = r"C:\Users\User\Downloads\transactions_data.csv"  
data = pd.read_csv(file_path)

data = data.drop(columns=['Unnamed: 0'], errors='ignore')

data.columns = data.columns.str.strip().str.lower().str.replace(' ', '_')

data['transaction_date'] = pd.to_datetime(data['transaction_date'], format='%d/%m/%Y', errors='coerce') 

data['product_price'] = data['product_price'].replace(r'[^\d.]', '', regex=True).astype(float)
data['total_cost'] = data['total_cost'].replace(r'[^\d.]', '', regex=True).astype(float)

data['quantity'] = pd.to_numeric(data['quantity'], errors='coerce')


data['product_price'] = data['product_price'].fillna(data['product_price'].mean())
data['total_cost'] = data['total_cost'].fillna(data['total_cost'].mean())

data['quantity'] = data['quantity'].fillna(data['quantity'].median()) 

data['product_id'] = data['product_id'].fillna(-1)
data['company_id'] = data['company_id'].fillna(-1)

data = data.dropna(subset=['product_id', 'quantity', 'product_price', 'total_cost'])

data.to_csv('cleaned_transactions_data.csv', index=False)

print(data.head())


   transaction_id  company_id  product_id  quantity transaction_date  \
0             1.0        88.0         6.0      11.0              NaT   
1             2.0        29.0        19.0      16.0              NaT   
2             NaN        28.0        18.0       6.0              NaT   
3             4.0        85.0        12.0      12.0              NaT   
4             5.0        47.0         3.0       8.0       2021-06-07   

   product_price  total_cost  
0  194379.147964   1075200.0  
1   97930.993380   1428000.0  
2  126095.547778    940800.0  
3  134652.802537   1008000.0  
4   99575.609634    705600.0  


In [37]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Use the features (Product_Price, Quantity) and the target ('Total_Cost')
feature_columns = ['Product_Price', 'Quantity']
target = 'Total_Cost'

# Drop rows with missing values in features and target
data = data.dropna(subset=feature_columns + [target])

# Select features (X) and target variable (y)
X = data[feature_columns]
y = data[target]

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R-squared score

print("Mean Squared Error (MSE):", mse)
print("R-squared (R²) score:", r2)

# Display model coefficients and intercept
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

# Predictions
print("Predicted Total Costs:", y_pred[:10])  # Show first 10 predictions


Mean Squared Error (MSE): 69755767105.9942
R-squared (R²) score: 0.9142358552471196
Coefficients: [1.02933705e+01 1.32173917e+05]
Intercept: -1353834.705719971
Predicted Total Costs: [-163958.65842669  389749.24429542  944584.53892183  428674.25371822
 1420849.38190308 1953659.20231764 1649768.7538117   689625.46342224
 1044233.77235066 1873099.04115201]
