In [63]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = r"C:\Users\DELL\Desktop\Invoice_Merged_file - Copy.xlsx" # Update this with the path to your file
df = pd.read_excel(file_path, sheet_name='Sheet1')

# Data Cleaning
# Convert date columns to datetime format
df['Invoice_Date '] = pd.to_datetime(df['Invoice_Date '])
df['Payment_date'] = pd.to_datetime(df['Payment_date'])

# # Handle missing values
df.dropna(axis=0,inplace=True)


# # Standardize text data
df['DTA_Supplier_State '] = df['DTA_Supplier_State '].str.capitalize()
df['DTA_Supplier_Name '] = df['DTA_Supplier_Name '].str.capitalize()

# # Encode categorical variables
label_encoders = {}
for column in ['Invoice_Type ', 'DTA_Supplier_State ', 'DTA_Supplier_Name ', ]:
     label_encoders[column] = LabelEncoder()
     df[column] = label_encoders[column].fit_transform(df[column])

# # Define feature and target variables for predictive analytics
X = df.drop(columns=['Status',])
y = df['Status'].apply(lambda x: 1 if x == 'DSPF Approved' else 0)  # Binary classification: Approved (1) or Not Approved (0)

# # Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,Invoice_Type,Invoice_Date,Amount_in_INR,DTA_Supplier_Name,DTA_Supplier_State,Payment_date
2407,1,2022-01-29,18000.0,31,15,2022-01-29
2850,1,2020-12-08,32084.0,101,3,2021-01-22
3085,1,2020-05-29,171760.0,65,3,2020-05-29
2995,1,2020-05-06,7500.0,191,3,2020-06-05
1202,0,2021-08-18,-75108.0,183,15,2021-08-18
...,...,...,...,...,...,...
3202,1,2019-10-16,1814.0,169,15,2019-11-15
1097,1,2020-10-01,241695.0,169,15,2020-10-01
1132,1,2021-08-18,71400.0,70,1,2021-10-02
1296,1,2021-12-20,576576.0,162,15,2022-02-03


In [64]:
df.isnull().sum()

Invoice_Type           0
Invoice_Date           0
Amount_in_INR          0
Status                 0
DTA_Supplier_Name      0
DTA_Supplier_State     0
Payment_date           0
dtype: int64

In [65]:
X

Unnamed: 0,Invoice_Type,Invoice_Date,Amount_in_INR,DTA_Supplier_Name,DTA_Supplier_State,Payment_date
0,1,2019-11-16,660.0,145,15,2019-11-28
1,1,2019-11-04,6000.0,156,15,2019-11-28
2,1,2019-11-04,1500.0,156,15,2019-11-28
3,1,2019-11-04,1500.0,156,15,2019-11-28
4,1,2019-11-01,2531288.0,169,15,2019-11-21
...,...,...,...,...,...,...
3216,1,2019-10-23,186841.0,181,15,2019-11-21
3217,1,2019-10-01,36962.0,187,15,2019-10-17
3218,1,2019-10-05,1717.0,178,15,2019-11-15
3219,1,2019-10-14,2060.0,178,15,2019-11-15


In [66]:
y

0       0
1       0
2       0
3       0
4       1
       ..
3216    1
3217    1
3218    1
3219    1
3220    1
Name: Status, Length: 3111, dtype: int64

In [67]:
df['Status'].value_counts()

Status
DSPF Approved                    2873
Material                           52
No Benefit                         50
Duplicate entry                    45
Invoice not received for DSPF      23
material                           22
Pending for working                19
Under query                        16
No Benefit-3rd party invoices       4
Material-Duty paid                  3
No benefit                          2
Banglore Unit                       2
Name: count, dtype: int64

In [68]:
y_train

2407    1
2850    1
3085    1
2995    1
1202    1
       ..
3202    0
1097    1
1132    1
1296    1
862     1
Name: Status, Length: 2488, dtype: int64

In [69]:
X.dtypes

Invoice_Type                    int32
Invoice_Date           datetime64[ns]
Amount_in_INR                 float64
DTA_Supplier_Name               int32
DTA_Supplier_State              int32
Payment_date           datetime64[ns]
dtype: object

In [70]:
X_train.drop(['Invoice_Date ','Payment_date'],inplace=True,axis=1)
X_test.drop(['Invoice_Date ','Payment_date'],inplace=True,axis=1)             

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a RandomForest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Accuracy: 0.9341894060995185
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.49      0.53        47
           1       0.96      0.97      0.96       576

    accuracy                           0.93       623
   macro avg       0.77      0.73      0.75       623
weighted avg       0.93      0.93      0.93       623



In [78]:
X_test.iloc[:1,:]

Unnamed: 0,Invoice_Type,Amount_in_INR,DTA_Supplier_Name,DTA_Supplier_State
695,1,80870.0,172,15


In [80]:
y_test.iloc[:1]

695    0
Name: Status, dtype: int64

In [82]:
rf_classifier.predict(X_test.iloc[:1,:])

array([0], dtype=int64)

In [86]:
X_test.iloc[4:5,:]

Unnamed: 0,Invoice_Type,Amount_in_INR,DTA_Supplier_Name,DTA_Supplier_State
219,1,3004568.0,96,15


In [85]:
y_test.iloc[4:5]

219    1
Name: Status, dtype: int64

In [87]:
rf_classifier.predict(X_test.iloc[4:5,:])

array([1], dtype=int64)