Skip to content

Commit

Permalink
Merge branch '__rultor'
Browse files Browse the repository at this point in the history
  • Loading branch information
rultor committed Apr 26, 2024
2 parents 1d4476a + 4cab8dd commit 1d46ae9
Show file tree
Hide file tree
Showing 7 changed files with 235 additions and 18 deletions.
4 changes: 4 additions & 0 deletions model/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from predictor import Predictor

# Manual smoke run: classify a single hard-coded candidate with the default
# model and print the raw prediction returned by the predictor.
predictor = Predictor()
prediction = predictor.predict("invan/spring3-mvc-maven-json-hello-world")
print(prediction)
14 changes: 9 additions & 5 deletions model/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,17 @@
"""
from transformers import pipeline


class Predictor:
    """
    Classifier of texts backed by a ``transformers`` pipeline.

    :param ref: Model reference handed to ``pipeline`` as its ``model``
     argument.
    """

    def __init__(self, ref="h1alexbel/github-samples-classifier"):
        self.ref = ref
        # (ref, classifier) pair, filled on the first predict() call.
        # Building the pipeline is deferred so that creating a Predictor
        # and calling model() never load anything.
        self._cache = None

    def predict(self, text):
        """
        Classify the given text.

        The pipeline is built once per model reference and reused, instead
        of being rebuilt on every call.

        :param text: Text to classify.
        :return: Whatever the pipeline returns for ``text``.
        """
        # Rebuild only when nothing is cached yet, or when ``ref`` was
        # reassigned after the previous call.
        if self._cache is None or self._cache[0] != self.ref:
            self._cache = (
                self.ref,
                pipeline("sentiment-analysis", model=self.ref),
            )
        return self._cache[1](text)

    def model(self):
        """
        :return: Model reference this predictor uses.
        """
        return self.ref
24 changes: 11 additions & 13 deletions objects/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

import typer

from .filter_pipe import FilterPipe
from model.predictor import Predictor
from .dataset import Dataset
from objects import NAME, VERSION
from .pre_filter import PreFilter
Expand All @@ -44,7 +46,7 @@ def _version_callback(value: bool) -> None:
@app.command()
def filter(
repositories: str = typer.Option(
..., "--repositories", help="Path to the repositories CSV file."
..., "--repositories", help="Path to the input repositories CSV file."
),
out: str = typer.Option(
..., "--out", help="Path to the output CSV file."
Expand All @@ -53,18 +55,14 @@ def filter(
"""
Filter repositories.
"""
typer.echo(f"Filtering {repositories}...")
# @todo #10:45min Filter the repositories using general-like interface.
# We should execute filtering here using some general interface, so
# it would easy to use either LLM or ML filters.
# @todo #19:45 Implement chain of csv transformation.
# We should implement a transformation chain of csv files.
# For now we are just adding separate objects to this script.
# Let's create a class (let's call it `train` or `pipeline`) that would
# execute all transformation one by one.
PreFilter(out).prepare()
Dataset(Input(repositories).copy()).formulate()
typer.echo(f"Filtering completed. Saving output to {out}...")
# @todo #18:30min Find effective way for processing readme.
# For now we are not processing readme because of
# <a href="https://github.com/h1alexbel/samples-filter/issues/39">this</a>.
# We need to find actual way to process readme too since it can be crucial
# data as model input. Let's study papers, outlined
# <a href="https://github.com/yegor256/cam/issues/227#issue-2200080559">here</a>
# first, rethink it and try to implement here.
FilterPipe(repositories, out, Predictor(), typer).apply()


# Run it.
Expand Down
40 changes: 40 additions & 0 deletions objects/feed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# The MIT License (MIT)
#
# Copyright (c) 2024 Aliaksei Bialiauski
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import csv

"""
Feed.
"""


class Feed:
    """
    Feed of names read from a CSV file.

    :param file: Path to a CSV file whose header row contains a
     ``full_name`` column.
    """

    def __init__(self, file):
        self.file = file

    def read(self):
        """
        Read the ``full_name`` value of every row.

        :return: List of ``full_name`` values, in file order; empty when the
         file has no data rows.
        :raises KeyError: If the rows have no ``full_name`` column.
        """
        # newline="" lets the csv module do its own line-ending handling, so
        # line breaks embedded in quoted fields are not rewritten on the way in.
        with open(self.file, "r", newline="") as source:
            return [row["full_name"] for row in csv.DictReader(source)]
77 changes: 77 additions & 0 deletions objects/filter_pipe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# The MIT License (MIT)
#
# Copyright (c) 2024 Aliaksei Bialiauski
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import csv

from .feed import Feed
from .text_prediction import TextPrediction

"""
Filter pipe.
"""


class FilterPipe:
    """
    Pipe that copies a repositories CSV, leaving out the rows classified as
    samples.

    :param repos: Path to the input CSV file; its rows are looked up by the
     ``full_name`` column.
    :param output: Path to the CSV file the remaining rows are written to.
    :param predictor: Object with ``predict(text)`` and ``model()`` methods.
    :param typer: Object with an ``echo(message)`` method used for progress
     messages.
    :param predictions: Path to the CSV file where every prediction is logged.
    """

    def __init__(self, repos, output, predictor, typer,
                 predictions="predictions.csv"):
        self.repos = repos
        self.output = output
        self.predictor = predictor
        self.typer = typer
        self.predictions = predictions

    # @todo #18:60min Create integration test case for filter_pipe.py.
    # We should create some sort of integration test that checks filtering
    # together with model prediction, files creation and other things happening
    # in #apply(). Don't forget to remove this puzzle.
    def apply(self):
        """
        Classify every candidate from the input file, log the predictions and
        write the rows that were not classified as samples to the output file.
        """
        self.typer.echo(f"Filtering {self.repos}...")
        samples = self._classify(Feed(self.repos).read())
        self.typer.echo(f"found {len(samples)} samples")
        self._exclude(samples)
        self.typer.echo(f"Filtering completed. Output saved to {self.output}")

    def _classify(self, feed):
        """
        Predict each candidate, log it, and return those answered "sample".
        """
        samples = []
        # newline="" is what the csv module expects for files it writes to;
        # without it rows end in "\r\r\n" on platforms that translate "\n".
        with open(self.predictions, "w", newline="") as log:
            writer = csv.DictWriter(
                log,
                fieldnames=["candidate", "prediction", "model"]
            )
            writer.writeheader()
            self.typer.echo(
                f"Predicting... (all predictions will be saved into {log.name})"
            )
            for candidate in feed:
                prediction = self.predictor.predict(candidate)
                writer.writerow(
                    {
                        "candidate": candidate,
                        "prediction": prediction,
                        "model": self.predictor.model()
                    }
                )
                answer = TextPrediction(prediction).as_text()
                self.typer.echo(f"{candidate} classified as {answer}")
                if answer == "sample":
                    samples.append(candidate)
        return samples

    def _exclude(self, samples):
        """
        Copy input rows to the output file, skipping rows named in samples.
        """
        # Set membership keeps the per-row check constant-time.
        excluded = set(samples)
        with open(self.repos, "r", newline="") as source, \
                open(self.output, "w", newline="") as target:
            reader = csv.DictReader(source)
            writer = csv.DictWriter(target, fieldnames=reader.fieldnames)
            writer.writeheader()
            writer.writerows(
                row for row in reader if row["full_name"] not in excluded
            )
37 changes: 37 additions & 0 deletions objects/text_prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# The MIT License (MIT)
#
# Copyright (c) 2024 Aliaksei Bialiauski
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Model prediction in simple text format.
"""


class TextPrediction:
    """
    Model prediction rendered as a plain text answer.

    :param pred: Prediction to render; ``pred[0]["label"]`` is the value
     that gets inspected.
    """

    def __init__(self, pred):
        self.pred = pred

    def as_text(self):
        """
        Render the prediction as text.

        :return: ``"sample"`` when the first label is ``"POSITIVE"``,
         ``"real"`` for any other label.
        """
        positive = self.pred[0]["label"] == "POSITIVE"
        return "sample" if positive else "real"
57 changes: 57 additions & 0 deletions tests/test_text_prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# The MIT License (MIT)
#
# Copyright (c) 2024 Aliaksei Bialiauski
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Test case for TextPrediction.
"""
import unittest

from objects.text_prediction import TextPrediction


class TestTextOut(unittest.TestCase):
    """
    Checks the text rendering of model predictions by TextPrediction.
    """

    def _assert_rendered(self, prediction, expected):
        # Shared assertion: render the prediction and compare with expected.
        text = TextPrediction(prediction).as_text()
        self.assertEqual(
            text,
            expected,
            f"output '{text}' for '{prediction}' does not match with expected {expected}"
        )

    def test_prints_as_real(self):
        self._assert_rendered([{'label': 'NEGATIVE', 'score': 1.0}], "real")

    def test_prints_as_sample(self):
        self._assert_rendered([{'label': 'POSITIVE', 'score': 1.0}], "sample")

    def test_raises_error_on_invalid_input(self):
        # A plain string is not a list of label dictionaries.
        with self.assertRaises(TypeError):
            TextPrediction("<invalid input>").as_text()

4 comments on commit 1d46ae9

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 1d46ae9 Apr 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 10-1fb8be50 disappeared from objects/cli.py), that's why I closed #18. Please, remember that the puzzle was not necessarily removed in this particular commit. Maybe it happened earlier, but we discovered this fact only now.

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 1d46ae9 Apr 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 19-b3973cba disappeared from objects/cli.py), that's why I closed #22. Please, remember that the puzzle was not necessarily removed in this particular commit. Maybe it happened earlier, but we discovered this fact only now.

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 1d46ae9 Apr 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 18-d3f80dfc discovered in objects/cli.py) and submitted as #62. Please, remember that the puzzle was not necessarily added in this particular commit. Maybe it was added earlier, but we discovered it only now.

@0pdd
Copy link
Collaborator

@0pdd 0pdd commented on 1d46ae9 Apr 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Puzzle 18-1f7b4ec1 discovered in objects/filter_pipe.py) and submitted as #63. Please, remember that the puzzle was not necessarily added in this particular commit. Maybe it was added earlier, but we discovered it only now.

Please sign in to comment.