##Packages

In [None]:
#!pip3 -q install accelerate -U
#!pip3 -q install transformers
#!pip3 -q install datasets
#Restart after installing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from transformers import pipeline

# Hugging Face models - Pipeline()

## Sentiment Analysis Model

In [None]:
from transformers import pipeline

In [None]:
senti_model = pipeline(task="sentiment-analysis")

In [None]:
senti_model("This movie is damn good. I loved it")

In [None]:
senti_model("This is a bad phone. The screen and battery are of poor quality.")

## Sentiment Analysis Model-2

In [None]:
Senti_model_2 = pipeline(task="sentiment-analysis",
                         model="cardiffnlp/twitter-roberta-base-sentiment-latest")

In [None]:
Senti_model_2("Over heating issue don't by this product camera was good")

In [None]:
output = Senti_model_2("Over heating issue don't by this product camera was good")
output[0]['label']

In [None]:
Senti_model_2("Waste of money")

In [None]:
Senti_model_2("Good but manufacturing date is of feb 2025, looks like shopkeeper have old stock and he or she through on me to get rid of old stock")

## Prediction on your dataset

In [None]:
import pandas as pd

# Load the Amazon/Yelp review data and keep a small sample for the demo.
# A fixed random_state makes "Restart Kernel -> Run All" score the same 50 reviews every time.
user_review_data=pd.read_csv("https://raw.githubusercontent.com/giridhar276/Datasets/master/Amazon_Yelp_Reviews/Review_Data.csv")
user_review_data=user_review_data.sample(50, random_state=42)
user_review_data["Review"]

In [None]:
user_review_data["Predicted_Sentiment"] = user_review_data["Review"].apply(lambda x: Senti_model_2(x)[0]["label"])
user_review_data

## Load the model on GPU

In [None]:
# Without an explicit `device` the pipeline stays on the CPU, so this cell would be
# identical to `Senti_model_2`. Pick the best accelerator that is actually available and
# fall back to the CPU so the cell still runs on any machine.
import torch

_mps_backend = getattr(torch.backends, "mps", None)  # attribute is absent on older torch builds
if torch.cuda.is_available():
    inference_device = "cuda:0"
elif _mps_backend is not None and _mps_backend.is_available():
    inference_device = "mps"
else:
    inference_device = "cpu"

Senti_model_2_gpu = pipeline(task="sentiment-analysis",
                         model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                         device=inference_device)

In [None]:
user_review_data["Predicted_Sentiment"] = user_review_data["Review"].apply(lambda x: Senti_model_2_gpu(x)[0]["label"])
user_review_data

## Language Translation Model

In [None]:
translator_model = pipeline(task="translation_en_to_fr",
                            model="google-t5/t5-small")
translator_model("My name is Giri")

In [None]:
#Clear the cache in GPU
import torch
torch.cuda.empty_cache()

## Question and Answer Based on a Document

In [None]:
qa_model = pipeline(task="question-answering",
                    model="deepset/roberta-base-squad2")

In [None]:
# If you get any locale related error
'''
import locale
print(locale.getpreferredencoding())

def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
'''

In [None]:
#Importing computer_scientists.txt document from github
#!wget https://raw.githubusercontent.com/giridhar276/Datasets/master/computer_scientists/computer_scientists.txt
# Read inside a `with` block so the file handle is closed, and name the encoding explicitly
# so the result does not depend on the machine's locale.
with open("computer_scientists.txt", encoding="utf-8") as document_file:
    document = document_file.read()

# if your area of interest is pdf ... get the pdf here
#import pypdf2
#import pdfminer

In [None]:
qa_model({'question':"Who is the first computer programmer?",
          'context':document})

In [None]:
qa_model({'question':"What did Yann LeCun contribute?",
          'context':document})

In [None]:
qa_model({'question':"Who is the father of deep learning?",
          'context':document})

## NER (Named Entity Recognition) Model


In [None]:
ner_model = pipeline(task="ner",
                     model="dslim/bert-base-NER",
                     aggregation_strategy="simple")
#aggregation_strategy="simple" merges sub-word tokens into whole entities, which makes the output easier to read


In [None]:
sample_doc="""
Hello,
  I, John Smith, a member of the Tech Innovators team, would like to schedule a meeting with you,
  Mary Johnson, from the Quantum Solutions group, on Tuesday, February 8th, 2024, at 10:00 AM.
  We can meet at your office in San Francisco or, if more convenient, at the Cafe Bella in New York City.
  Please let me know if this date and time work for you and I am using IP address 3.3.3.3 and using email id giridhar276@gmail.com
"""

In [None]:
entities = ner_model(sample_doc)
print(entities)

In [None]:
# Convert the above output into a dataframe and print it with the entity name
NER_result = pd.DataFrame(entities, columns=["word", "entity_group"])

# Print the DataFrame
print(NER_result)


## Text Summarization Model

In [None]:
summarizer_model = pipeline(task="summarization",
                            model="google/pegasus-xsum")

In [None]:
Book_essay = """
The 7 Habits of Highly Effective People" is a timeless self-help book by Stephen R. Covey that offers a holistic approach to personal and professional effectiveness. The book is a guide to transforming one's life by adopting seven fundamental habits.
Covey's philosophy centers on the idea that true success is achieved by aligning one's values with principles that govern human effectiveness. The first three habits focus on personal development, emphasizing the importance of taking control of one's life, setting clear goals, and prioritizing tasks based on importance rather than urgency.
The next three habits delve into the concept of interdependence, emphasizing the significance of effective communication, cooperation, and collaboration in achieving mutually beneficial outcomes. Covey argues that fostering strong interpersonal relationships and empathetic listening are key to building trust and synergy.
The seventh habit, "Sharpen the Saw," encourages continuous self-renewal and personal growth through physical, mental, emotional, and spiritual well-being.
Throughout the book, Covey provides practical advice and real-life examples to illustrate each habit's application in various aspects of life, from family and work to leadership and community involvement. "The 7 Habits of Highly Effective People" has had a profound impact on individuals seeking personal and professional growth, offering a framework for achieving lasting success and a sense of fulfillment..
"""

In [None]:
print(summarizer_model(Book_essay, max_length=120, min_length=3))

## Text Generation Model

In [None]:
text_generator_model = pipeline(task="text-generation",
                                model="gpt2")

In [None]:
# Generate text starting with the given prompt
text_result = text_generator_model("The best way to start a presentation is")
print(text_result)

# Hugging Face models without pipeline()

## Sentiment Analysis

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

In [None]:
import numpy as np
raw_text = "This is a great book"
encoded_input = tokenizer(raw_text, return_tensors='pt')
output = model(**encoded_input)
logits = output.logits.detach().numpy()
y_pred = np.argmax(logits)
y_pred

In [None]:
#Code for passing multiple examples to the above model

import numpy as np
# Prepare the input texts
texts = [
    "This is a great book",
    "The food was not tasty and it was very cold",
    "The weather is very good today",
]

# Tokenize and encode the input texts
encoded_inputs = tokenizer(texts, padding=True, return_tensors="pt")

# Pass the encoded inputs to the model
outputs = model(**encoded_inputs)

# Get the model's predictions
logits = outputs.logits.detach().cpu().numpy()

# Find the predicted class for each input
predictions = np.argmax(logits, axis=1)

# Print the predictions
print(predictions)


# Finetuning HuggingFace model
Code Explanation- [Click here](https://github.com/venkatareddykonasani/Assorted/blob/main/Fine_tuning_HuggingFace.md)

### Bank Complaints Data

In [None]:
#!wget https://github.com/giridhar276/Datasets/raw/master/Bank_Customer_Complaints/complaints_v2.zip
#!unzip -o complaints_v2.zip
#complaints_data = pd.read_csv("/content/complaints_v2.csv")
#complaints_data.head()


##!curl -LO https://github.com/giridhar276/Datasets/raw/master/Bank_Customer_Complaints/complaints_v2.zip
#!unzip -o complaints_v2.zip


complaints_data = pd.read_csv("complaints_v2.csv")
complaints_data.head(5)

### Use DistilBERT model without fine-tuning

In [None]:
# Distil bert model
from transformers import pipeline
distilbert_model = pipeline(task="text-classification",
                            model="distilbert-base-uncased",
                            )

In [None]:
distilbert_model(" mortgage well fargo bank since meet conditi...")[0]['label']


In [None]:
sample_data=complaints_data.sample(100, random_state=42)
sample_data["text"]=sample_data["text"].apply(lambda x: " ".join(x.split()[:350]))
sample_data["bert_predicted"] = sample_data["text"].apply(lambda x: distilbert_model(x)[0]["label"])


In [None]:
sample_data.head()

In [None]:
# The pipeline returns string labels ("LABEL_0", "LABEL_1", ...) rather than numbers.
# Parse everything after the last underscore instead of only the final character, so that
# multi-digit class ids (e.g. "LABEL_10") are converted correctly as well.
sample_data["bert_predicted_num"] = (
    sample_data["bert_predicted"].str.rsplit("_", n=1).str[-1].astype(int)
)
sample_data.head()

### Accuracy of the model without fine-tuning

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(sample_data["label"], sample_data["bert_predicted_num"])
print(cm)
accuracy=cm.diagonal().sum()/cm.sum()
print(accuracy)

## Project - Finetuning the model with our data


In [None]:
#!pip -q install accelerate -U
#!pip -q install transformers[torch]
#!pip -q install datasets

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, ClassLabel, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

In [None]:
#The target variable must be named as "label" - Verify it, before proceeding
print(sample_data.columns)

In [None]:
Sample_data = Dataset.from_pandas(sample_data)
# Split the dataset into training and testing sets (80% training, 20% testing).
# The intermediate result is named `split_dataset`: calling it `train_test_split` would
# overwrite the scikit-learn function of the same name imported in the cell above.
# A fixed seed keeps the split identical across kernel restarts.
split_dataset = Sample_data.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    'train': split_dataset['train'],
    'test': split_dataset['test']
})
dataset

### Load the tokenizer

In [None]:
# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Padding
# No pad-token surgery is needed here. The "pad_token = eos_token" recipe is meant for
# models such as GPT-2 that lack a padding token; this tokenizer defines no EOS token, so
# that assignment only set the pad token to None before '[PAD]' was added back.
# The tokenizer loaded above is used as-is.

def tokenize_function(examples):
    """Tokenize a batch of examples for DistilBERT.

    Args:
        examples: Batch (dict of columns) passed in by `Dataset.map(..., batched=True)`;
            only the "text" column is read.

    Returns:
        The tokenizer output for the batch, padded/truncated to exactly 512 tokens.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

### Load and Train the model

In [None]:
# DistilBERT with a freshly initialised 2-class classification head.
# Pass the tokenizer's real padding id: this tokenizer has no EOS token, so
# `tokenizer.eos_token_id` is None and would replace the checkpoint's pad_token_id with None.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                            num_labels=2,  # Adjust num_labels as needed
                                                            pad_token_id=tokenizer.pad_token_id)
model

In [None]:
training_args = TrainingArguments(
    output_dir="./results_bert_custom",
    num_train_epochs=1,
    logging_dir="./logs_bert_custom",
    eval_strategy="epoch",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Start training
trainer.train()

In [None]:
# Define the directory where you want to save your model and tokenizer
model_dir = "./distilbert_finetuned"

# Save the model
model.save_pretrained(model_dir)

# Save the tokenizer
tokenizer.save_pretrained(model_dir)

#Also save the model through the Trainer API, under a separate directory name
trainer.save_model('Distilbert_CustomModel_10K')

In [None]:
def make_prediction(text):
  """Classify a single complaint with the fine-tuned `model`.

  Args:
    text: Complaint text to classify.

  Returns:
    numpy array with the predicted class index (argmax over the logits) for the input.
  """
  # Dropout must be off for inference; the model may still be in training mode after Trainer.train().
  model.eval()
  # Truncate to the same 512-token limit used when tokenizing the training data;
  # longer complaints would otherwise overflow the model's position embeddings.
  inputs=tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
  # Follow the model to whatever device it lives on (cpu / cuda / mps) instead of hard-coding "mps".
  inputs = inputs.to(model.device)
  with torch.no_grad():  # no gradients needed for prediction
    outputs=model(**inputs)
  predictions=outputs.logits.argmax(-1)
  return predictions.cpu().numpy()

sample_data["finetuned_predicted"]=sample_data["text"].apply(lambda x: make_prediction(str(x))[0])
sample_data.sample(10)

In [None]:
from sklearn.metrics import confusion_matrix
# Create the confusion matrix
cm1 = confusion_matrix(sample_data["label"], sample_data["finetuned_predicted"])
print(cm1)
accuracy1=cm1.diagonal().sum()/cm1.sum()
print(accuracy1)

### Loading a pre-built model and making prediction

In [None]:
#Code to download the fine-tuned DistilBERT model
!gdown --id 1785J3ir19RaZP3ebbFvWUX88PMaBouro -O distilbert_finetuned_V1.zip
!unzip -o -j distilbert_finetuned_V1.zip -d distilbert_finetuned_V1

import torch

model_v1 = DistilBertForSequenceClassification.from_pretrained('./distilbert_finetuned_V1')
# Use whichever accelerator this machine has instead of assuming Apple-silicon "mps";
# fall back to the CPU so the cell runs everywhere.
_mps_backend = getattr(torch.backends, "mps", None)  # attribute is absent on older torch builds
if torch.cuda.is_available():
  model_v1_device = "cuda:0"
elif _mps_backend is not None and _mps_backend.is_available():
  model_v1_device = "mps"
else:
  model_v1_device = "cpu"
model_v1.to(model_v1_device)

In [None]:
def make_prediction(text):
  """Classify a single complaint with the downloaded `model_v1`.

  Note: this redefines the earlier `make_prediction`, which used the in-notebook `model`.

  Args:
    text: Complaint text to classify.

  Returns:
    numpy array with the predicted class index (argmax over the logits) for the input.
  """
  model_v1.eval()  # make sure dropout is disabled for inference
  # Truncate to 512 tokens so long complaints cannot overflow the model's position embeddings.
  inputs=tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
  # Send the inputs to the device the model is on rather than hard-coding "mps".
  inputs = inputs.to(model_v1.device)
  with torch.no_grad():  # no gradients needed for prediction
    outputs=model_v1(**inputs)
  predictions=outputs.logits.argmax(-1)
  return predictions.cpu().numpy()


In [None]:
sample_data_large=complaints_data.sample(n=1000, random_state=55)
sample_data_large["finetuned_predicted"]=sample_data_large["text"].apply(lambda x: make_prediction(str(x)[:350])[0])

In [None]:
sample_data_large["finetuned_predicted"]

In [None]:
from sklearn.metrics import confusion_matrix
# Create the confusion matrix
cm1 = confusion_matrix(sample_data_large["label"], sample_data_large["finetuned_predicted"])
print(cm1)
accuracy1=cm1.diagonal().sum()/cm1.sum()
print(accuracy1)

# Saving the Model on HuggingFace hub

In [None]:
!pip install transformers
!pip install huggingface_hub
!pip install -U ipykernel #for executing the commands


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments


In [None]:
!pip3 install gdown
!gdown --id 1785J3ir19RaZP3ebbFvWUX88PMaBouro -O distilbert_finetuned_V1.zip
!unzip -o -j distilbert_finetuned_V1.zip -d distilbert_finetuned_V1

model = DistilBertForSequenceClassification.from_pretrained('./distilbert_finetuned_V1')


In [None]:
import os
from getpass import getpass

# SECURITY: never store an access token in the notebook itself. The token that used to be
# hard-coded in this cell must be treated as leaked and revoked
# (Hugging Face: Profile >> Settings >> Access Tokens).
# Reuse a token already present in the environment, otherwise prompt for one without echoing it.
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if not hf_token:
    hf_token = getpass("Hugging Face access token: ")

# HF_TOKEN is the variable huggingface_hub looks for; the legacy name is kept populated
# so anything that relied on the original variable keeps working.
os.environ["HF_TOKEN"] = hf_token
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

In [None]:
!pip3 install ipywidgets

In [None]:
from huggingface_hub import notebook_login
notebook_login()
#To get Auth token: Profile >> Settings >>Access Token

In [None]:
model.push_to_hub("giridhar276/Bank_distil_bert_10K_v2")

# Loading the model from HuggingFace hub

In [None]:
model=DistilBertForSequenceClassification.from_pretrained("giridhar276/Bank_distil_bert_10K")

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
import pandas as pd
#!wget https://github.com/giridhar276/Datasets/raw/master/Bank_Customer_Complaints/complaints_v2.zip
#!unzip -o complaints_v2.zip
complaints_data = pd.read_csv("./complaints_v2.csv")
list(complaints_data["text"].head())

In [None]:
import torch

In [None]:
complaint="""
payment history missing credit report made mistake put account forbearance without authorization knowledge matter fact automatic payment setup month monthly mortgage paid full noticed issue account marked forbearance credit report tried get new home loan another new bank contacted immediately asked fix error provide letter detail please see asks forbearance issue seemed fixed however credit report payment history missing new bank able approve new loan issue missing payment history contacted time since phone ask thing report payment history transunion fix missing data issue provide letter show account never forbearance payment history past month however waiting week countless email phone call talk multiple supervisor able get either one thing without issue fixed new bank process new loan application therefore need help immediately get fixed
"""

inputs=tokenizer(complaint, return_tensors="pt")
outputs=model(**inputs)
predictions=outputs.logits.argmax(-1)
predictions=predictions.detach().cpu().numpy()
print(predictions)

# Web App Creation

In [None]:
%%writefile requirements.txt
streamlit
numpy
pandas
torch
transformers
huggingface_hub

In [None]:
!pip3 install -r requirements.txt

In [None]:
%%writefile app.py
import streamlit as st
import numpy as np
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('giridhar2761111/Bank_distil_bert_10K')

st.title("Bank Complaints Categorization")
st.write("Sample Complaints are given below")
Sample_Complaints = [
    {"Sentence": "Credit Report - payment history missing credit report made mistake put account forbearance without authorization "},
    {"Sentence": "Retail Related - forwarded message cc sent friday pdt subject final legal payment well fargo well fargo clearly wrong need look actually opened account see court hearing several different government agency "}
]
st.table(Sample_Complaints)
user_input = st.text_input("Enter a complaint:")
button=st.button("Classify")

d={
    0: "Credit reporting",
    1: "Mortgage and Others"
}

if user_input and button:
  inputs=tokenizer(user_input, return_tensors="pt")
  outputs=model(**inputs)
  predictions=outputs.logits.argmax(-1)
  predictions=predictions.detach().cpu().numpy()
  print(predictions)
  st.write("Prediction :" , d[predictions[0]])


In [None]:
!streamlit run app.py & npx localtunnel --port 8501 & curl ipv4.icanhazip.com

#This sometimes doesn't work on Chrome

In [None]:
%%writefile app1.py
import streamlit as st
import pandas as pd

st.title("📄 Excel File Uploader and Viewer")

# Upload Excel file
uploaded_file = st.file_uploader("Choose an Excel file", type=["xlsx", "xls"])

if uploaded_file is not None:
    try:
        # Read the Excel file
        df = pd.read_excel(uploaded_file)
        
        # Show dataframe
        st.success("File uploaded successfully!")
        st.subheader("Preview of the uploaded Excel file:")
        st.dataframe(df)
    except Exception as e:
        st.error(f"Error reading the Excel file: {e}")
else:
    st.info("Please upload an Excel file to proceed.")
