In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# !pip install transformers torch librosa numpy pandas matplotlib


In [None]:
# !pip install gradio

In [25]:
import os
import numpy as np
import librosa
import torch
import matplotlib.pyplot as plt
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, DistilBertTokenizer, DistilBertForSequenceClassification
import gradio as gr
import logging
import traceback


In [26]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [27]:
# Load models(fine-tuned)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

try:
    wav2vec2_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)
    distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
    distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english').to(device)
    logger.info("Models loaded successfully")
except Exception as e:
    logger.error(f"Error loading models: {str(e)}")
    raise


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2

In [28]:
# Define audio processing functions
def speech_to_text(audio):
    try:
        input_values = wav2vec2_tokenizer(audio, return_tensors="pt").input_values.to(device)
        logits = wav2vec2_model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = wav2vec2_tokenizer.batch_decode(predicted_ids)[0]
        return transcription
    except Exception as e:
        logger.error(f"Error in speech_to_text: {str(e)}")
        return f"Error in speech_to_text: {str(e)}\n{traceback.format_exc()}"

def analyze_sentiment(text):
    try:
        inputs = distilbert_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = distilbert_model(**inputs)
        sentiment = torch.argmax(outputs.logits, dim=1)
        sentiment_label = "Positive" if sentiment.item() == 1 else "Negative"
        return sentiment_label
    except Exception as e:
        logger.error(f"Error in analyze_sentiment: {str(e)}")
        return f"Error in analyze_sentiment: {str(e)}\n{traceback.format_exc()}"

def extract_audio_features(audio):
    try:
        # Calculate pitches
        pitches, _ = librosa.piptrack(y=audio, sr=22050)
        pitch_values = pitches[pitches > 0]  # Get non-zero pitch values
        pitch = np.mean(pitch_values) if pitch_values.size > 0 else 0  # Calculate mean pitch

        # Calculate tempo
        tempo, _ = librosa.beat.beat_track(y=audio, sr=22050)

        # Calculate spectral centroid (tone)
        spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=22050)
        tone = np.mean(spectral_centroids) if spectral_centroids.size > 0 else 0

        return float(pitch), float(tempo), float(tone)  # Ensure these are floats
    except Exception as e:
        logger.error(f"Error in extract_audio_features: {str(e)}")
        return 0.0, 0.0, 0.0  # Return zeros as floats on error




def process_audio(audio, sample_rate):
    try:
        logger.info(f"Processing audio with sample rate: {sample_rate}")

        # Convert audio to float32 if it's not already
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        # Normalize audio to be in the range [-1.0, 1.0]
        audio = audio / np.max(np.abs(audio))

        # Resample audio to 16kHz for Wav2Vec2
        audio_16k = librosa.resample(y=audio, orig_sr=sample_rate, target_sr=16000)

        # Speech to text
        transcription = speech_to_text(audio_16k)
        logger.info(f"Transcription: {transcription}")

        # Sentiment analysis
        sentiment = analyze_sentiment(transcription)
        logger.info(f"Sentiment: {sentiment}")

        # Extract audio features
        pitch, tempo, tone = extract_audio_features(audio_16k)
        logger.info(f"Audio features - Pitch: {pitch}, Tempo: {tempo}, Tone: {tone}")

        return transcription, sentiment, pitch, tempo, tone
    except Exception as e:
        logger.error(f"Error in process_audio: {str(e)}")
        return f"Error in process_audio: {str(e)}\n{traceback.format_exc()}", "Error", 0, 0, 0


In [29]:
# Define visualization functions
def plot_waveform(audio):
    plt.figure(figsize=(10, 4))
    plt.plot(audio)
    plt.title('Audio Waveform')
    plt.xlabel('Sample')
    plt.ylabel('Amplitude')
    plt.grid()
    plt.close()
    return plt

def plot_features(pitch, tempo, tone):
    labels = ['Pitch', 'Tempo', 'Tone']
    values = [pitch, tempo, tone]

    # Ensure values are all floats for the bar plot
    logger.info(f"Plotting features with values: {values} (Types: {[type(v) for v in values]})")

    plt.figure(figsize=(10, 4))
    plt.bar(labels, values, color=['blue', 'orange', 'green'])
    plt.title('Audio Features')
    plt.ylabel('Value')
    plt.ylim(0, max(values) + 1)  # Set appropriate y-limits
    plt.close()
    return plt




In [30]:
def audio_sentiment_analyzer(audio):
    if audio is None:
        logger.warning("No audio received")
        return "No audio recorded", "N/A", None, None

    try:
        sr, audio = audio
        logger.info(f"Received audio with sample rate: {sr} and shape: {audio.shape}")

        transcription, sentiment, pitch, tempo, tone = process_audio(audio, sr)

        # Debugging: Log pitch, tempo, tone types and values
        logger.info(f"Pitch: {pitch} (Type: {type(pitch)})")
        logger.info(f"Tempo: {tempo} (Type: {type(tempo)})")
        logger.info(f"Tone: {tone} (Type: {type(tone)})")

        # Create visualizations
        fig_waveform = plot_waveform(audio)
        fig_features = plot_features(pitch, tempo, tone)

        return transcription, sentiment, fig_waveform, fig_features
    except Exception as e:
        error_msg = f"Error in audio_sentiment_analyzer: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return error_msg, error_msg, None, None


In [31]:
# Create Gradio interface
iface = gr.Interface(
    fn=audio_sentiment_analyzer,
    inputs=gr.Audio(type="numpy"),  # Adjusted the input here
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Sentiment"),
        gr.Plot(label="Audio Waveform"),
        gr.Plot(label="Audio Features")
    ],
    title="Audio Sentiment Classifier",
    description="Record audio to analyze its sentiment and transcribe the speech.",
    theme="huggingface",
    allow_flagging="never"
)



Sorry, we can't find the page you are looking for.


In [32]:
# Launch Gradio interface
# Click the link, record using the browser, submit
iface.launch(debug=True, share=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1642ec8f7fc651af24.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  return float(pitch), float(tempo), float(tone)  # Ensure these are floats


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://1642ec8f7fc651af24.gradio.live




# Push to github


In [None]:
from google.colab import drive
drive.mount('/content/drive')


In [None]:
%cd /content/drive/MyDrive/ai_app_development


In [None]:
!git init

In [15]:
!git config --global user.email "mohan_gi@hotmail.com"
!git config --global user.name "girimohan"

In [None]:
!git add .
!git commit -m "Initial commit with project files"

In [None]:
# Push the changes again
!git push -f origin main