# SENTIMENT ANALYSIS

1. Vader for lexicon and rule based classification
2. RoBERTa for classification based on semantics

In [1]:
!pip install textblob vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
!mkdir input_csvs output_csvs

## VADER

In [1]:
import re
from bs4 import BeautifulSoup

def extract_text_from_html(html_content):
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        text = soup.get_text(separator=" ", strip=True)

        text = re.sub(r'\s+', ' ', text)

        return text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return None


In [None]:
import os
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def analyze_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(extract_text_from_html(str(text)))

    return vs['compound']

def process_csv_files(input_dir, output_dir, text_column='text'):
    """
    Processes all CSV files in the input directory, adds a 'sentiment' column,
    and saves the modified CSV files to the output directory.

    Args:
        input_dir (str): Path to the directory containing CSV files.
        output_dir (str): Path to the directory to save the modified CSV files.
        text_column (str): Name of the column containing the text to analyze.
    """

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        if filename.endswith('.csv'):
            input_filepath = os.path.join(input_dir, filename)
            output_filepath = os.path.join(output_dir, filename)

            try:
                df = pd.read_csv(input_filepath)

                if text_column not in df.columns:
                    print(f"Warning: Column '{text_column}' not found in {filename}. Skipping.")
                    continue

                df['sentiment'] = df[text_column].apply(analyze_sentiment)
                df.to_csv(output_filepath, index=False)
                print(f"Processed and saved: {filename}")

            except Exception as e:
                print(f"Error processing {filename}: {e}")

if __name__ == "__main__":
    input_directory = "input_csvs"
    output_directory = "output_csvs"
    text_column_name = "content"

    process_csv_files(input_directory, output_directory, text_column_name)

## ROBERTA

In [None]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the pretrained RoBERTa model
MODEL = 'Cloudy1225/stackoverflow-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def extract_text_from_html(html_content):
    """Extract text from HTML content."""
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        text = soup.get_text(separator=" ", strip=True)
        text = re.sub(r'\s+', ' ', text)
        return text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return None

def preprocess(text):
    """Preprocess text (username and link placeholders)."""
    new_text = []
    for t in text.split(' '):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return ' '.join(new_text).strip()

def analyze_sentiment(text):
    """Analyze sentiment using RoBERTa-based model."""
    try:
        text = extract_text_from_html(str(text))
        print("text to be analyzed:", text)
        if not text:
            return None, None, None, None

        text = preprocess(text)
        # encoded_input = tokenizer(text, return_tensors='pt')
        encoded_input = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=514
        )

        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        negative, neutral, positive = scores[0], scores[1], scores[2]
        sentiment_score = positive - negative

        print(f"Sentiment Score: {sentiment_score}, Negative: {negative}, Neutral: {neutral}, Positive: {positive}")

        return sentiment_score, negative, neutral, positive
    except Exception as e:
        print(f"Error analyzing text: {e}")
        return None, None, None, None

def process_csv_files(input_dir, output_dir, text_column='text'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        if filename.endswith('.csv'):
            input_filepath = os.path.join(input_dir, filename)
            output_filepath = os.path.join(output_dir, filename)

            try:
                df = pd.read_csv(input_filepath)

                if text_column not in df.columns:
                    print(f"Warning: Column '{text_column}' not found in {filename}. Skipping.")
                    continue

                results = df[text_column].apply(analyze_sentiment)
                df['sentiment_score'] = results.apply(lambda x: x[0])
                df['negative_weight'] = results.apply(lambda x: x[1])
                df['neutral_weight'] = results.apply(lambda x: x[2])
                df['positive_weight'] = results.apply(lambda x: x[3])

                df.to_csv(output_filepath, index=False)
                print(f"Processed and saved: {filename}")

            except Exception as e:
                print(f"Error processing {filename}: {e}")

if __name__ == "__main__":
    input_directory = "input_csvs"
    output_directory = "output_csvs_roberta"
    text_column_name = "content"

    process_csv_files(input_directory, output_directory, text_column_name)


text to be analyzed: Update: Microsoft now provide virtual machine images for various versions of IE that are ready to use on all of the major OS X virtualisation platforms ( VirtualBox VMWare Fusion and Parallels ). Download the appropriate image from: http://www.modern.ie/en-us/virtualization-tools#downloads On an Intel based Mac you can run Windows within a virtual machine. You will need one virtual machine for each version of IE you want to test against. The instructions below include free and legal virtualisation software and Windows disk images. Download some virtual machine software. The developer disk images we're going to use are will work with either VMWare Fusion or Sun Virtual Box . VMWare has more features but costs $80 Virtual Box on the other hand is more basic but is free for most users (see Virtual Box licensing FAQ for details). Download the IE developer disk images which are free from Microsoft: http://www.microsoft.com/downloads/... Extract the disk images using cab


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(html_content, "html.parser")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sentiment Score: 5.1092778448946774e-05, Negative: 0.00010639100219123065, Neutral: 0.9997361302375793, Positive: 0.00015748378064017743
text to be analyzed: I've to test an existing android app that uses Google maps. I was able to run it in an emulator AVD with Google APIs (API 17). I have to test it in high resolution devices like Samsung Galaxy S4 (1920x1080) etc. Then I updated the SDK and now all AVDs (both old & newly created) with Google APIs behave the same way: It boots but right after booting a segmentation fault (core dumped) occurs and it crashes. DDMS monitor shows that the last error is related with EGL. I'm running the emulators from a linux machine(Fedora kernel 2.6.35) with 2GB RAM and an AMD Turion64 1.6Ghz processor. Tried running it with APIs 16 17 and 18 still the same error occurs. AVDs with Normal SDKs work fine issue is with Google APIs which enable Play Map services etc. Tried emulator-arm also bu


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(html_content, "html.parser")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sentiment Score: -0.9910133481025696, Negative: 0.9911419153213501, Neutral: 0.008729521185159683, Positive: 0.00012858194531872869
text to be analyzed: I still didn't try Rails 3, so my answer will be more general. And I don't know basic auth module for NginX. If your team is connected localy, then you can create server accessible from local network only. If you need access via Internet, then you can hide it behind VPN. You can set access to site only through local ip and give ssh access to anybody who need it. It is easy to create socks proxy via ssh (in linux: ssh -D 8080 user@yourserver.com; where 8080 is port number, then set socks proxy in browser and you can lunch yoursever.com:3000). I think that NginX allows you to set allowed IP's and deny other - so you can use it also for access restriction. And also you can temporarly add to ApplicationController before_filter :require_login :), so only login page will be ava


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(html_content, "html.parser")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sentiment Score: -0.7676721215248108, Negative: 0.7680231332778931, Neutral: 0.23162586987018585, Positive: 0.0003510129463393241
text to be analyzed: You probably mean authenticate against Active directory in which Exchange server is integrated.Then you can use LDAP: Authenticating in PHP using LDAP through Active Directory
Sentiment Score: -2.6797366444952786e-05, Negative: 0.0001331670064246282, Neutral: 0.9997604489326477, Positive: 0.00010636963997967541
text to be analyzed: I would suggest: %s/\s\zs[0:]*\ze\d// I tried it on your example and it seems to do what you want.
Sentiment Score: 0.00011817274935310706, Negative: 8.729451656108722e-05, Neutral: 0.9997072815895081, Positive: 0.00020546726591419429
text to be analyzed: I keep getting prompted for a user name and password when I try to access the following URL: http://localhost:8080/manager/html App Server: Tomcat 6.0.35 Browser: Firefox 3.6.18 OS: Centos 5.5 C