In [None]:
! pip install pandas


In [13]:
import pandas as pd
import numpy as np
import os
from typing import Tuple
from dotenv import load_dotenv
# Text Analytics module
from azure.ai.textanalytics import TextAnalyticsClient
# Azure Key Credentials module
from azure.core.credentials import AzureKeyCredential

# Call load_dotenv() first, then read the service endpoint and key from the
# process environment. Either value is None when its variable is not set.
load_dotenv()
cog_endpoint = os.environ.get("COG_SERVICE_ENDPOINT")
cog_key = os.environ.get("COG_SERVICE_KEY")

In [70]:
def get_sentiment(
    text: str, client: "TextAnalyticsClient | None" = None
) -> Tuple[str, str, float, float, float]:
    """Detect the language of `text`, then score its sentiment in that language.

    Args:
        text: The document to analyze.
        client: Optional, already-constructed Text Analytics client. Pass one
            when scoring many texts so the client is built once instead of on
            every call. When omitted, a client is created from the
            module-level `cog_endpoint` and `cog_key`.

    Returns:
        A tuple `(language_name, sentiment, positive, neutral, negative)`:
        the detected language's name, the document-level sentiment label and
        the three confidence scores taken from the sentiment result.

    Note:
        Each call issues two client requests: `detect_language`, then
        `analyze_sentiment`.
        NOTE(review): per-document error entries in either response are not
        handled here; they would presumably surface as attribute errors on
        the lines below — confirm whether callers need friendlier handling.
    """
    if client is None:
        # Fallback for one-off calls: build a client from the notebook config.
        client = TextAnalyticsClient(cog_endpoint, AzureKeyCredential(cog_key))

    documents = [text]

    # First request: find out which language the text is written in.
    detected_language = client.detect_language(documents=documents)[0].primary_language

    # Second request: score the sentiment, telling the service the language.
    doc = client.analyze_sentiment(documents, language=detected_language.iso6391_name)[0]
    scores = doc.confidence_scores

    return detected_language.name, doc.sentiment, scores.positive, scores.neutral, scores.negative

## Version user_input:

In [100]:

# Ask for a piece of text, score it, and report the result.
text = input("Text to analyze: ")
language, sentiment, positive, neutral, negative = get_sentiment(text)
print(f"Detected {language} text with {sentiment} sentiment")
# Format with `.1%` instead of multiplying by 100: `0.07 * 100` evaluates to
# 7.000000000000001 in floating point and would otherwise be printed verbatim.
print(f"({positive:.1%} positive, {neutral:.1%} neutral, {negative:.1%} negative)")

Detected French text with negative sentiment
(12.0% positive, 0.0% neutral, 88.0% negative)


## Version Csv_pandas

In [72]:
# Load the labelled reviews; later cells read its 'target' and 'reviews' columns.
df_review = pd.read_csv('reviews.csv') 

In [73]:
# Label names in positive / neutral / negative order (French spellings).
# NOTE(review): not referenced by the other cells visible here — confirm it is still needed.
az_labels = ['positif','neutre','negatif']
# Scratch check: argmax returns the position of the largest value (index 2 here).
np.argmax([2,34,88])

2

In [93]:
def az_review(text):
    """Score `text` with `get_sentiment` and return the results as a list.

    The list holds, in order: language, sentiment, positive, neutral, negative.
    """
    return list(get_sentiment(text))

def apply_from_to(df: pd.DataFrame, col1: str, col2: str, func=None) -> pd.DataFrame:
    """Fill column `col2` of `df` with `func` applied to each value of `col1`.

    Args:
        df: Frame to update. It is modified in place and also returned.
        col1: Name of the source column.
        col2: Name of the column to create or overwrite.
        func: Callable applied to each source value. Defaults to `az_review`,
            looked up when the function is called.

    Returns:
        The same `df` object, with `col2` holding one result per row.

    Notes:
        The row position is printed before each call as a progress indicator.
        Results are collected first and assigned in a single step, so `df` is
        left untouched if `func` raises part-way through. Assigning once also
        avoids chained `df[col2][i] = ...` writes and works for any index,
        not only a 0..n-1 one.
    """
    if func is None:
        func = az_review

    results = []
    for i, value in enumerate(df[col1]):
        print(i)
        results.append(func(value))

    # An explicit object Series keeps list-valued results as one cell per row.
    df[col2] = pd.Series(results, index=df.index, dtype=object)
    return df

def cut_string(string: str, max_length: int = 5100) -> str:
    """Return at most the first `max_length` characters of `string`.

    Args:
        string: Text to truncate.
        max_length: Maximum number of characters to keep. The default of 5100
            is presumably a safety margin under the 5120-character threshold
            that another cell of this notebook checks reviews against —
            confirm against the service limits.

    Returns:
        `string` unchanged when it is short enough, otherwise its prefix.
    """
    return string[:max_length]

def which_az_predict(maliste):
    """Return the label whose confidence score is the highest.

    `maliste` is a result list whose entries from index 2 onward are read as
    the positive, neutral and negative scores, in that order. The function
    returns 'pos', 'neutre' or 'neg' accordingly; on ties the first maximum
    is used.
    """
    labels = ('pos', 'neutre', 'neg')
    scores = maliste[2:]
    return labels[np.argmax(scores)]

In [None]:
# Truncate every review with cut_string before it is sent for scoring.
df_review['reviews_cut'] = df_review['reviews'].apply(cut_string)
# Score each truncated review and store the result list in 'az_no_clean'.
# This goes through get_sentiment once per row, so it issues client requests for every review.
df_review = apply_from_to(df_review,'reviews_cut','az_no_clean')

In [76]:
df_review

Unnamed: 0,target,reviews,reviews_cut,az_no_clean
0,neg,"plot : two teen couples go to a church party ,...","plot : two teen couples go to a church party ,...","[English, mixed, 0.17, 0.07, 0.76]"
1,neg,the happy bastard's quick movie review \ndamn ...,the happy bastard's quick movie review \ndamn ...,"[English, mixed, 0.17, 0.07, 0.76]"
2,neg,it is movies like these that make a jaded movi...,it is movies like these that make a jaded movi...,"[English, mixed, 0.23, 0.08, 0.69]"
3,neg,""" quest for camelot "" is warner bros . ' firs...",""" quest for camelot "" is warner bros . ' firs...","[English, mixed, 0.28, 0.04, 0.68]"
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis : a mentally unstable man undergoing ...,"[English, mixed, 0.12, 0.05, 0.83]"
...,...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,wow ! what a movie . \nit's everything a movie...,"[English, mixed, 0.73, 0.02, 0.25]"
1996,pos,"richard gere can be a commanding actor , but h...","richard gere can be a commanding actor , but h...","[English, mixed, 0.22, 0.08, 0.7]"
1997,pos,"glory--starring matthew broderick , denzel was...","glory--starring matthew broderick , denzel was...","[English, mixed, 0.17, 0.04, 0.79]"
1998,pos,steven spielberg's second epic film on world w...,steven spielberg's second epic film on world w...,"[English, mixed, 0.32, 0.04, 0.64]"


In [None]:
import re
def remove_punctuation(oldtext):
    """Replace every run of characters other than ASCII letters with one space.

    Despite the name, digits, whitespace and accented letters are replaced
    too, not only punctuation.
    """
    return re.sub(r'[^A-Za-z]+', ' ', oldtext)

# Letters-only, lower-cased copy of each review.
df_review['clean'] = df_review['reviews'].apply(remove_punctuation).str.lower()
# Encode the label as 1 ('pos') / 0 ('neg') in a single mapping. This rebuilds the
# column from 'target' on every run, instead of filling it piecewise through a
# float/NaN intermediate. Any other label becomes NaN and makes astype(int) raise.
df_review['num_target'] = df_review['target'].map({'pos': 1, 'neg': 0}).astype(int)
df_review

In [91]:
which_az_predict(df_review['az_no_clean'][0])

'negatif'

In [94]:
# Reduce each stored result list to the label with the highest confidence score.
df_review['az_predict'] = df_review['az_no_clean'].apply(which_az_predict)

In [96]:
df_review['target'].value_counts()

neg    1000
pos    1000
Name: target, dtype: int64

In [95]:
df_review['az_predict'].value_counts()

neg    1750
pos     250
Name: az_predict, dtype: int64

## Stuff_exploration

In [None]:

# df_review['az_no_clean'] = df_review['reviews'].apply(az_review)

In [47]:
az_review(df_review['reviews'][0])

['mixed', 0.17, 0.07, 0.76]

In [21]:
# NOTE(review): this overwrites the whole 'az_no_clean' column with 0. If the notebook
# is run top to bottom it discards the results computed earlier — confirm it should stay.
df_review['az_no_clean'] = 0

In [19]:
df_review['reviews'][0:2].apply(az_review)

0    [English, mixed, 0.17, 0.07, 0.76]
1    [English, mixed, 0.17, 0.07, 0.76]
Name: reviews, dtype: object

In [30]:
df_review

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...
...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...
1996,pos,"richard gere can be a commanding actor , but h..."
1997,pos,"glory--starring matthew broderick , denzel was..."
1998,pos,steven spielberg's second epic film on world w...


In [55]:
# Count the reviews that are longer than 5120 characters.
# NOTE(review): 5120 is presumably the service's per-document size limit — confirm.
print((df_review['reviews'].apply(len) > 5120).sum())

366

In [67]:
(df_review['reviews'].apply(len) > 5120)[10:20]

10    False
11    False
12    False
13     True
14    False
15    False
16    False
17    False
18    False
19    False
Name: reviews, dtype: bool

In [64]:
# Only the last expression of a cell is displayed, so the first two lengths are
# computed and discarded; the visible output is the length after cut_string.
len(df_review['reviews'][1999])
len(df_review['reviews'][1999][:5100])
len(cut_string(df_review['reviews'][1999]))

5100