In [0]:
# biblioteki
import pandas as pd
import numpy as np
import re

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import FunctionTransformer

In [0]:
# Data: two tab-separated files are downloaded; `train` is used for fitting
# and cross-validation, `data` is the frame scored at the end of the notebook.
# Missing cells in both frames are replaced with a single space.
url = 'https://raw.githubusercontent.com/dk1000/Warsztaty_Jaroszewicz/master/train.tsv'
url1 = "https://raw.githubusercontent.com/dk1000/Warsztaty_Jaroszewicz/master/test_noy.tsv"

train = pd.read_csv(url, sep="\t").fillna(" ")
data = pd.read_csv(url1, sep="\t").fillna(" ")

In [0]:
def extract_text_features(s):
  """Compute simple length statistics for every text in a Series.

  Parameters
  ----------
  s : pandas.Series
    Texts to describe; each element is converted to ``str`` first.

  Returns
  -------
  numpy.ndarray
    Float array of shape ``(len(s), 3)`` whose columns are: number of
    characters, number of whitespace-separated tokens, and characters
    per token. A text without any token gets 0.0 in the last column
    instead of a non-finite value.
  """
  s = s.astype("str")
  n = s.str.len().values.astype(float)
  n_w = s.str.split().str.len().values.astype(float)
  # Blank texts have zero tokens; a plain division would yield inf/nan,
  # which downstream estimators refuse to fit on. Rows where the divisor
  # is zero keep the 0.0 pre-filled in `out`.
  avg_w_len = np.divide(n, n_w, out=np.zeros_like(n), where=n_w > 0)
  return np.column_stack([n, n_w, avg_w_len])

In [0]:
# The first column of the training frame holds the class of each row;
# the binary target is True exactly for rows whose class is "pants-fire".
Y = train.iloc[:, 0]
label = Y.eq("pants-fire")

In [0]:
## Hand-crafted features added to the training frame.

# Split every statement on whitespace once; all statement-based features
# below are derived from these token lists.
statement_tokens = [line.split() for line in train["statement"]]

# Mean token length per statement. A statement with no tokens (e.g. a
# missing value that was filled with " ") gets 0.0 instead of raising
# ZeroDivisionError.
train["average"] = [
    sum(len(word) for word in words) / len(words) if words else 0.0
    for words in statement_tokens
]

# Number of tokens per statement.
train["number"] = [len(words) for words in statement_tokens]

# 0/1 flags: does the exact (case-sensitive) token occur in the statement?
for keyword in ("million", "percent", "Obama", "health", "more", "tax"):
    train[keyword] = [int(keyword in words) for words in statement_tokens]

# The same kind of flag, computed on the whitespace-split "subject" column.
subject_tokens = [line.split() for line in train["subject"]]
for keyword in ("health", "economy"):
    train["subject_" + keyword] = [int(keyword in words) for words in subject_tokens]




In [0]:
### Hand-crafted features added to the test_noy frame (`data`).
### The columns must match the ones created on the training frame.

# Split every statement on whitespace once; all statement-based features
# below are derived from these token lists.
statement_tokens = [line.split() for line in data["statement"]]

# Mean token length per statement. A statement with no tokens (e.g. a
# missing value that was filled with " ") gets 0.0 instead of raising
# ZeroDivisionError.
data["average"] = [
    sum(len(word) for word in words) / len(words) if words else 0.0
    for words in statement_tokens
]

# Number of tokens per statement.
data["number"] = [len(words) for words in statement_tokens]

# 0/1 flags: does the exact (case-sensitive) token occur in the statement?
for keyword in ("million", "percent", "Obama", "health", "more", "tax"):
    data[keyword] = [int(keyword in words) for words in statement_tokens]

# The same kind of flag, computed on the whitespace-split "subject" column.
subject_tokens = [line.split() for line in data["subject"]]
for keyword in ("health", "economy"):
    data["subject_" + keyword] = [int(keyword in words) for words in subject_tokens]

In [0]:
# Fixed seed for the randomized SVD solver, so that the cross-validation
# score and the final predictions are repeatable between runs.
SEED = 0

model = LogisticRegression(solver = 'newton-cg', penalty = "l2", C = 0.5, n_jobs = -1)


def _tfidf_svd(n_components):
    """Return a TF-IDF -> TruncatedSVD pipeline producing `n_components` columns."""
    return Pipeline([("TFIDF", TfidfVectorizer()),
                     ("svd", TruncatedSVD(n_components=n_components, n_iter=10, random_state=SEED))])


# Each entry is (name, transformer, column selector). A plain string selector
# hands the transformer a 1-D column (needed by the text vectorizers), a list
# selector hands it a 2-D slice (used for the numeric passthrough columns).
# NOTE(review): the "more" flag is added to both frames earlier in the notebook
# but is not listed here, so it never reaches the model - confirm whether that
# omission is intentional.
ct = ColumnTransformer([("statement", TfidfVectorizer(stop_words = "english"), "statement"),
                        ("statement_svd1", _tfidf_svd(1000), "statement"),
                        ("statement_svd2", _tfidf_svd(500), "statement"),
                        ("statement_svd3", _tfidf_svd(100), "statement"),
                        ("funcTrans", FunctionTransformer(func=extract_text_features, validate=False, accept_sparse = True), "statement"),
                        ("party", TfidfVectorizer(), "party"),
                        ("context", TfidfVectorizer(ngram_range = (1,2)), "context"),
                        ("speaker_job", HashingVectorizer(), "speaker_job"),
                        ("speaker", TfidfVectorizer(ngram_range = (1,2)), "speaker"),
                        ("subject", TfidfVectorizer(), "subject"),
                        ("avg_length", "passthrough", ["average"]),
                        ("number_words", "passthrough", ["number"]),
                        ("statement_percent", "passthrough", ["percent"]),
                        ("statement_Obama", "passthrough", ["Obama"]),
                        ("statement_health", "passthrough", ["health"]),
                        ("statement_million", "passthrough", ["million"]),
                        ("statement_tax", "passthrough", ["tax"]),
                        ("subject_health", "passthrough", ["subject_health"]),
                        ("subject_economy", "passthrough", ["subject_economy"])
                       ])

p = Pipeline([("columntransformer", ct), ("logistic", model)])

# 10-fold cross-validated ROC AUC of the whole pipeline on the training frame.
scores = cross_val_score(p, train, label, cv=10, scoring = 'roc_auc')
print(scores.mean())

0.7591343434118688


In [0]:
### Producing the results: fit on the full training frame, then score `data`.

# Pipeline.fit returns the fitted pipeline itself, so the two calls are chained;
# `p` is left fitted exactly as before.
wynik = p.fit(train, label).decision_function(data)

In [0]:
# Write the decision scores to the text file 'wynik.res'.
np.savetxt(fname='wynik.res', X=wynik, delimiter=",")